In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef
import pandas as pd
import logging
from sklearn.model_selection import KFold

In [2]:
# Load the combined labeled dataset and split it into one frame per product category.
df = pd.read_csv("../data/labeled/combined.csv")

# Build the grouping once and pull each category out of it; get_group raises
# KeyError if a category name is absent from the file.
by_category = df.groupby(df.category)
electronics = by_category.get_group("Electronics")
pet = by_category.get_group("Pet supplies")
baby = by_category.get_group("Baby")
sports = by_category.get_group("Sport outdoors")

In [3]:
# Give the "transformers" logger an explicit WARNING threshold.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Configure the root logger at ERROR level.
logging.basicConfig(level=logging.ERROR)
# Process-wide cut-off: WARNING, INFO and DEBUG records are dropped for every
# logger, so in practice only ERROR and above can still be emitted.
logging.disable(logging.WARNING)

In [4]:
def evaluation(train_df, test_df):
    """Fine-tune a fresh ``roberta-base`` classifier on one split and score it on another.

    Args:
        train_df: DataFrame used for training. When it has ``sentence`` and
            ``label`` columns, exactly those two are used (by name). Frames
            that already carry ``text``/``labels`` headers, or that lack
            ``sentence``/``label``, are passed to the library unchanged.
        test_df: DataFrame with a ``sentence`` column (inputs to predict on)
            and a ``label`` column (ground truth).

    Returns:
        Tuple ``(f1, acc, mcc)``: macro-averaged F1 score, accuracy and
        Matthews correlation coefficient of the predictions on ``test_df``.
    """
    # Six training epochs, progress output silenced, and every save_* option
    # switched off (plus no_save=True).
    model_args = ClassificationArgs(
        num_train_epochs=6,
        overwrite_output_dir=True,
        no_save=True,
        silent=True,
        save_best_model=False,
        save_eval_checkpoints=False,
        save_model_every_epoch=False,
        save_optimizer_and_scheduler=False,
    )

    # A new model is created on every call, so no weights carry over between folds.
    model = ClassificationModel("roberta", "roberta-base", args=model_args)

    # Simple Transformers looks for "text"/"labels" headers and otherwise
    # falls back to the first two columns *by position*. Scoring below reads
    # "sentence"/"label" *by name*, so select and rename the same columns here
    # to keep training independent of the CSV's column order.
    column_names = set(train_df.columns)
    already_formatted = {"text", "labels"} <= column_names
    if {"sentence", "label"} <= column_names and not already_formatted:
        train_data = train_df[["sentence", "label"]].rename(
            columns={"sentence": "text", "label": "labels"}
        )
    else:
        # Keep the original pass-through behavior for any other layout.
        train_data = train_df

    # Train the model
    model.train_model(train_data)

    # Score the held-out split; raw model outputs are not needed.
    targets = test_df.label.to_list()
    predictions, _ = model.predict(test_df.sentence.to_list())
    f1 = f1_score(targets, predictions, average="macro")
    acc = accuracy_score(targets, predictions)
    mcc = matthews_corrcoef(targets, predictions)
    return f1, acc, mcc

In [5]:
# 5-fold cross-validation with shuffling; the fixed random_state makes the
# split of each category reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=2)

# (label written to the results, frame to cross-validate), in evaluation order.
# One loop replaces four copy-pasted ones, so a label can no longer drift
# out of sync with the frame it describes.
category_frames = (
    ("baby", baby),
    ("pet", pet),
    ("sports", sports),
    ("electronics", electronics),
)

# One row per (category, fold): [category, f1, accuracy, mcc].
data = []
for category_name, data_df in category_frames:
    for train_index, test_index in kf.split(data_df):
        # kf.split yields positional indices, hence iloc.
        train_df = data_df.iloc[train_index]
        test_df = data_df.iloc[test_index]
        f1, acc, mcc = evaluation(train_df, test_df)
        data.append([category_name, f1, acc, mcc])
    print(category_name, "done")

df_result = pd.DataFrame(data, columns=['category', 'f1-score', 'accuracy', 'matthews-corr'])



baby done




pet done




sports done




electronics done


In [6]:
# Average the per-fold metrics within each category.
df_result.groupby("category").mean()

Unnamed: 0_level_0,f1-score,accuracy,matthews-corr
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baby,0.814686,0.817,0.632526
electronics,0.788043,0.795387,0.578524
pet,0.829736,0.8485,0.661963
sports,0.790663,0.80339,0.582217


In [7]:
# Persist the per-fold metrics (one row per category and fold) without the index column.
results_path = '../results/roberta.csv'
df_result.to_csv(results_path, index=False)