In [None]:
import os

def create_folder(path):
    """Create the directory ``path``, including missing parents, if absent.

    ``exist_ok=True`` replaces the separate existence check, so the call
    stays safe when the directory appears between the check and the creation
    (e.g. when several processes start at once).

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)



In [None]:
import pandas as pd
# Preview the raw training CSV. It is parsed with pandas' Python engine and
# decoded with the 'unicode_escape' codec; the last line shows the first rows.
df1 = pd.read_csv('Phrase_Banks/train.csv',engine='python', encoding='unicode_escape')
df1.head()

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

In [None]:
# For Sprig Categories

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
from datasets import load_from_disk
from urllib.parse import urlparse

def get_custom_dataset(filename='Phrase_Banks/train.csv'):
    """Load the sentiment CSV and build stratified train/validation/test splits.

    The CSV must contain a ``sentiment`` column and a ``selected_text``
    column. Sentiments are encoded as negative=0, neutral=1, positive=2; any
    other value (including a missing one) falls back to neutral (1).

    Missing texts are replaced by an empty string before the string
    conversion, so they do not end up as the literal text ``"nan"``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        dict with the keys ``labels`` (unique encoded labels found in the
        data), ``label_to_index``, ``index_to_label``, ``label_count`` and the
        parallel lists ``train_texts`` / ``train_label_indices``,
        ``val_texts`` / ``val_label_indices`` and ``test_texts`` /
        ``test_label_indices``.
    """
    def map_to_label(label):
        """Encode a sentiment name as a class id; unknown values become 1."""
        if label == 'neutral':
            return 1
        elif label == 'positive':
            return 2
        elif label == 'negative':
            return 0
        else:
            return 1

    label_column = 'label'
    combined_column = 'selected_text'

    data = pd.read_csv(filename, on_bad_lines='skip',encoding="unicode_escape")
    print(data.columns)

    data[label_column] = data['sentiment'].map(map_to_label)
    # Fill missing texts first: astype(str) alone would turn NaN into "nan".
    data['out'] = data[combined_column].fillna("").astype(str)
    texts = data["out"].tolist()
    labels = data[label_column].unique()
    label_to_index = {1.0: 1, 0.0: 0,2.0:2}
    index_to_label = {1: 1.0, 0: 0.0, 2:2.0}
    data["label_index"] = data[label_column].map(label_to_index)
    label_indices = data["label_index"].tolist()
    label_count = len(label_to_index)
    print('label_to_index')
    print(label_to_index)

    # 80 % of the rows go to training; the remaining 20 % are held out.
    print('splitting train data')
    train_texts, temp_texts, train_label_indices, temp_label_indices = train_test_split(texts, label_indices, test_size=0.2, stratify=data[label_column], random_state=42, shuffle=True)
    # The hold-out is divided again: 90 % validation, 10 % test.
    print('splitting test and validation data')
    val_texts, test_texts, val_label_indices, test_label_indices = train_test_split(temp_texts, temp_label_indices, test_size=0.1, stratify=temp_label_indices, random_state=42, shuffle=True)

    print("train dataset size: " + str(len(train_texts)))
    print("val dataset size: " + str(len(val_texts)))
    print("test dataset size: " + str(len(test_texts)))
    return {"labels" : labels, "label_to_index" : label_to_index, "label_count" : label_count,
            "index_to_label" : index_to_label, "train_texts" : train_texts, "val_texts" : val_texts, "test_texts" : test_texts,
            "train_label_indices" : train_label_indices, "val_label_indices" : val_label_indices, "test_label_indices" : test_label_indices}




In [None]:
def get_custom_dataset_single(filename=""):
    """Load a CSV of texts and class indices without splitting it.

    Class indices are taken from an existing ``label_index`` column when the
    file has one. Otherwise they are derived from the ``sentiment`` column
    with the encoding negative=0, neutral=1, positive=2 (the one used by
    ``get_custom_dataset``); unknown or missing sentiments fall back to 1.
    Missing texts are loaded as empty strings.

    Args:
        filename: Path of the CSV file to read. It must contain a
            ``selected_text`` column.

    Returns:
        dict with ``"texts"`` (list of str) and ``"category"`` (list of
        class indices), in file order.

    Raises:
        KeyError: If the file has neither a ``label_index`` nor a
            ``sentiment`` column.
    """
    sentiment_to_index = {'negative': 0, 'neutral': 1, 'positive': 2}
    combined_column = 'selected_text'

    data = pd.read_csv(filename, on_bad_lines='skip',encoding="unicode_escape")
    print(data.columns)

    # Fill missing texts first: astype(str) alone would turn NaN into "nan".
    texts = data[combined_column].fillna("").astype(str).tolist()

    label_to_index = {1.0: 1, 0.0: 0,2.0:2}
    if "label_index" in data.columns:
        label_indices = data["label_index"].tolist()
    elif "sentiment" in data.columns:
        label_indices = [sentiment_to_index.get(name, 1) for name in data["sentiment"]]
    else:
        raise KeyError("expected a 'label_index' or 'sentiment' column in " + str(filename))
    print('label_to_index')
    print(label_to_index)

    return {
        "texts" : texts,
        "category" : label_indices
    }


In [None]:
# Read the default CSV and keep the resulting splits and label maps around
# for the following cells.
params = get_custom_dataset()

In [None]:
# One plain dict per split: the texts and, in the same order, their class indices.
train_dataset = {"texts": params["train_texts"], "category": params["train_label_indices"]}
test_dataset = {"texts": params["test_texts"], "category": params["test_label_indices"]}
val_dataset = {"texts": params["val_texts"], "category": params["val_label_indices"]}

In [None]:
from datasets import Dataset, DatasetDict

# Wrap each split dict in a Dataset, then group the three under named splits.
train_ds, test_ds, val_ds = (
    Dataset.from_dict(split_columns)
    for split_columns in (train_dataset, test_dataset, val_dataset)
)

dataset_dict = DatasetDict({"train": train_ds, "test": test_ds, "validation": val_ds})

dataset_dict

In [None]:
%%time

# Checkpoint shared by the tokenizer here and the model loaded further down.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
    model_max_length=512,
)
# Class ids known to the dataset (the keys of the label map).
labels = list(params["label_to_index"])

In [None]:
%%time

def preprocess_data(examples):
    """Tokenize a batch of examples and attach their class indices.

    Args:
        examples: Batch mapping with a ``texts`` list and a parallel
            ``category`` list, as passed by ``map(..., batched=True)``.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    text = examples['texts']
    # No padding at this stage: padding=True would pad every row to the
    # longest text of the whole map batch. The Trainer is constructed with
    # the tokenizer and pads each training/evaluation batch on its own.
    encoding = tokenizer(text, truncation=True)
    encoding['labels'] = examples['category']

    return encoding

# Tokenize all splits and drop the raw columns so only model inputs remain.
encoded_dataset = dataset_dict.map(preprocess_data, batched=True, remove_columns=dataset_dict['train'].column_names)

In [None]:
# Have the encoded splits return torch tensors when indexed.
encoded_dataset.set_format("torch")

In [None]:
# Sanity check of the values handed to the model constructor.
print(model_name, len(labels), params["index_to_label"], params["label_to_index"])

In [None]:
# Load the checkpoint with a sequence-classification head sized to the number
# of classes.
# NOTE(review): the label maps passed here use floats (id -> 1.0 etc.) rather
# than readable class names, so exported configs will show numeric labels —
# confirm this is intended.
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=len(labels),
                                                           id2label=params["index_to_label"],
                                                           label2id=params["label_to_index"],
                                                           ignore_mismatched_sizes=True)

In [None]:
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
i = 1  # call counter, incremented on every compute_metrics call


def compute_metrics(pred, class_names=('negative', 'neutral', 'positive')):
    """Compute classification metrics and plot a confusion matrix.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class).
        class_names: Display name of each class id, in id order. The default
            matches the encoding negative=0, neutral=1, positive=2.

    Returns:
        dict with ``Accuracy``, ``macro_f1``, ``micro_f1``, ``weighted_f1``,
        ``Precision`` and ``Recall`` (precision/recall are macro averages).
    """
    global i

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, macro_f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    print(precision)
    micro_f1 = f1_score(labels, preds, average='micro')
    weighted_f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    print("classification report")
    print(classification_report(labels, preds))

    i = i + 1

    # Pin the matrix to the full class list so it always has one row/column
    # per class name, even if a class is absent from this evaluation set.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(labels, preds, labels=class_ids)

    cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = list(class_names))
    cm_display.plot()
    return {
        'Accuracy': acc,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1,
        'weighted_f1' : weighted_f1,
        'Precision': precision,
        'Recall': recall
    }


In [None]:

from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

batch_size = 1
# Make the model config use the EOS token id as its padding id.
model.config.pad_token_id = model.config.eos_token_id

lr = 2e-6

# Stop after 2 evaluations in a row that fail to improve the tracked metric
# by more than 0.02.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,  
    early_stopping_threshold=0.02, 
)

args = TrainingArguments(
    output_dir= './output_logs/' + 'model_roberta_base_phrase_banks',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    # Required by EarlyStoppingCallback: track a metric and restore the best
    # checkpoint at the end. 'macro_f1' is one of the keys returned by
    # compute_metrics; evaluation and saving both happen per epoch.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
)


In [None]:
# Per-epoch evaluation uses the validation split, so the test split stays
# untouched until the explicit evaluate() calls that follow training.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # EarlyStoppingCallback refuses to run unless the best model is reloaded
    # at the end, so it is attached only when the arguments enable that.
    callbacks=[early_stopping_callback] if args.load_best_model_at_end else None,
)

In [None]:
# Score on the test split before any training, as a baseline.
trainer.evaluate(eval_dataset=encoded_dataset["test"])

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Score on the test split after training.
trainer.evaluate(eval_dataset=encoded_dataset["test"])

In [None]:
# Score on the validation split after training.
trainer.evaluate(eval_dataset=encoded_dataset["validation"])

In [None]:
# Write the model (with safe_serialization turned off) and the tokenizer into
# the same directory.
save_path =  "./saved_model/new_mod_" + "1" +'/'  # Change to your desired save path
model.save_pretrained(save_path, safe_serialization=False)
tokenizer.save_pretrained(save_path)

In [None]:
# Final test-split evaluation; compute_metrics also plots the confusion
# matrix. The metrics dict is kept in `results` and displayed.
print("running evaluation on test set with confusion matrix")
results = trainer.evaluate(eval_dataset=encoded_dataset["test"])
results