In [1]:
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [2]:
class SVM_Wrapper:
    '''
    Linear-kernel SVM text classifier on top of a tf-idf or bag of words vectorizer.

    All dataframes passed in need a 'sentence' column holding the raw text;
    training and evaluation additionally need a 'label' column.
    '''

    def __init__(self, mode: str ='tf-idf'):
        '''
        :param mode: when set to 'tf-idf' uses a tf-idf vectorizer, any other value
                     selects a bag of words (count) vectorizer
        '''
        self.model = None   # fitted svm.SVC, assigned by train()
        self.vocab = None   # fitted vectorizer, assigned by train()
        self.mode = mode

    def train(self, train_df: pd.DataFrame, stop_words = None):
        '''
        Fits a fresh vectorizer and a fresh SVM on the provided training dataframe.
        The results are stored on the instance (self.vocab / self.model), replacing
        anything left over from an earlier call; nothing is returned.
        :param train_df: pandas dataframe with column 'sentence' and 'label'
        :param stop_words: forwarded unchanged to the vectorizer's stop_words argument
        '''
        if self.mode == 'tf-idf':
            vectorizer = TfidfVectorizer(max_features=2000, stop_words=stop_words)
        else:
            vectorizer = CountVectorizer(max_features=2000, stop_words=stop_words)

        # learn the vocabulary and turn text into bag of words / tf-idf vectors in one pass
        x = vectorizer.fit_transform(train_df['sentence'])
        y = train_df['label']

        # degree / gamma are not passed: they only affect the poly / rbf / sigmoid kernels
        model = svm.SVC(C=1.0, kernel='linear')
        model.fit(x, y)

        self.model = model
        self.vocab = vectorizer

    def predict(self, df: pd.DataFrame):
        '''
        Predicts a label for every row of df using the vectorizer and model from train().
        :param df: pandas dataframe with column 'sentence'; a 'label' column is not needed
        :return: whatever the fitted model's predict() returns for the vectorized text
        '''
        # turn text into bag of words / tf-idf vector
        x = self.vocab.transform(df['sentence'])
        return self.model.predict(x)

    def evaluation(self, train_df, test_df, stop_words = None):
        '''
        Retrains on train_df (overwriting any previously fitted state) and scores
        the predictions for test_df against its 'label' column.
        :param train_df: pandas dataframe with column 'sentence' and 'label'
        :param test_df: pandas dataframe with column 'sentence' and 'label'
        :param stop_words: forwarded to train(); the default None matches the old behaviour
        :return: tuple (macro f1-score, accuracy, matthews correlation coefficient)
        '''
        self.train(train_df, stop_words=stop_words)
        pred = self.predict(test_df)
        targets = test_df['label'].to_list()
        f1 = f1_score(targets, pred, average="macro")
        acc = accuracy_score(targets, pred)
        mcc = matthews_corrcoef(targets, pred)
        return f1, acc, mcc

In [3]:
class NB_Wrapper:
    '''
    Multinomial naive Bayes text classifier on top of a tf-idf or bag of words vectorizer.

    All dataframes passed in need a 'sentence' column holding the raw text;
    training and evaluation additionally need a 'label' column.
    '''

    def __init__(self, mode: str ='tf-idf'):
        '''
        :param mode: when set to 'tf-idf' uses a tf-idf vectorizer, any other value
                     selects a bag of words (count) vectorizer
        '''
        self.model = None   # fitted MultinomialNB, assigned by train()
        self.vocab = None   # fitted vectorizer, assigned by train()
        self.mode = mode

    def train(self, train_df: pd.DataFrame, stop_words = None):
        '''
        Fits a fresh vectorizer and a fresh naive Bayes model on the training dataframe.
        The results are stored on the instance (self.vocab / self.model), replacing
        anything left over from an earlier call; nothing is returned.
        :param train_df: pandas dataframe with column 'sentence' and 'label'
        :param stop_words: forwarded unchanged to the vectorizer's stop_words argument
        '''
        if self.mode == 'tf-idf':
            vectorizer = TfidfVectorizer(max_features=2000, stop_words=stop_words)
        else:
            vectorizer = CountVectorizer(max_features=2000, stop_words=stop_words)

        # learn the vocabulary and turn text into bag of words / tf-idf vectors in one pass
        x = vectorizer.fit_transform(train_df['sentence'])
        y = train_df['label']

        model = MultinomialNB()
        model.fit(x, y)

        self.model = model
        self.vocab = vectorizer

    def predict(self, df: pd.DataFrame):
        '''
        Predicts a label for every row of df using the vectorizer and model from train().
        :param df: pandas dataframe with column 'sentence'; a 'label' column is not needed
        :return: whatever the fitted model's predict() returns for the vectorized text
        '''
        # turn text into bag of words / tf-idf vector
        x = self.vocab.transform(df['sentence'])
        return self.model.predict(x)

    def evaluation(self, train_df, test_df, stop_words = None):
        '''
        Retrains on train_df (overwriting any previously fitted state) and scores
        the predictions for test_df against its 'label' column.
        :param train_df: pandas dataframe with column 'sentence' and 'label'
        :param test_df: pandas dataframe with column 'sentence' and 'label'
        :param stop_words: forwarded to train(); the default None matches the old behaviour
        :return: tuple (macro f1-score, accuracy, matthews correlation coefficient)
        '''
        self.train(train_df, stop_words=stop_words)
        pred = self.predict(test_df)
        targets = test_df['label'].to_list()
        f1 = f1_score(targets, pred, average="macro")
        acc = accuracy_score(targets, pred)
        mcc = matthews_corrcoef(targets, pred)
        return f1, acc, mcc

## Load data

In [4]:
from sklearn.model_selection import KFold

In [5]:
# Load the combined labeled dataset and split it into one dataframe per product category.
df = pd.read_csv("../data/labeled/combined.csv")

# Build the grouping once and reuse it for every category.
# get_group raises a KeyError if a category name does not occur in the file.
by_category = df.groupby(df.category)
electronics = by_category.get_group("Electronics")
pet = by_category.get_group("Pet supplies")
baby = by_category.get_group("Baby")
sports = by_category.get_group("Sport outdoors")

### SVM (TF-IDF features, 5-fold cross-validation per category)

In [6]:
# 5-fold cross-validation of the SVM, run separately inside each product category.
kf = KFold(n_splits = 5, shuffle = True, random_state = 2)
model = SVM_Wrapper()

# Insertion order fixes the row order of the result table.
category_frames = {
    "baby": baby,
    "pet": pet,
    "sports": sports,
    "electronics": electronics,
}

data = []   # one [category, f1, accuracy, mcc] row per fold
for category, data_df in category_frames.items():
    for train_index, test_index in kf.split(data_df):
        train_df = data_df.iloc[train_index]
        test_df = data_df.iloc[test_index]
        # evaluation() retrains the wrapper from scratch, so one instance can serve every fold
        f1, acc, mcc = model.evaluation(train_df, test_df)
        data.append([category, f1, acc, mcc])

df_result = pd.DataFrame(data, columns = ['category', 'f1-score', 'accuracy', 'matthews-corr'])

In [7]:
# Average the per-fold scores within each category.
df_result.groupby("category").mean()

Unnamed: 0_level_0,f1-score,accuracy,matthews-corr
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baby,0.764988,0.772,0.531893
electronics,0.733958,0.750357,0.470597
pet,0.77429,0.808,0.551212
sports,0.73016,0.752872,0.462372


In [8]:
# Save the per-fold SVM scores (one row per category and fold) without the index column.
df_result.to_csv('../results/svm-idf.csv', index=False)

### Naive Bayes (TF-IDF features, 5-fold cross-validation per category)

In [9]:
# 5-fold cross-validation of the naive Bayes model, run separately inside each product category.
kf = KFold(n_splits = 5, shuffle = True, random_state = 2)
model = NB_Wrapper()

# Insertion order fixes the row order of the result table.
category_frames = {
    "baby": baby,
    "pet": pet,
    "sports": sports,
    "electronics": electronics,
}

data = []   # one [category, f1, accuracy, mcc] row per fold
for category, data_df in category_frames.items():
    for train_index, test_index in kf.split(data_df):
        train_df = data_df.iloc[train_index]
        test_df = data_df.iloc[test_index]
        # evaluation() retrains the wrapper from scratch, so one instance can serve every fold
        f1, acc, mcc = model.evaluation(train_df, test_df)
        data.append([category, f1, acc, mcc])

df_result = pd.DataFrame(data, columns = ['category', 'f1-score', 'accuracy', 'matthews-corr'])

In [10]:
# Average the per-fold scores within each category.
df_result.groupby("category").mean()

Unnamed: 0_level_0,f1-score,accuracy,matthews-corr
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baby,0.7576,0.77,0.527528
electronics,0.650138,0.708338,0.369389
pet,0.625777,0.741,0.346471
sports,0.662842,0.720871,0.370459


In [11]:
# Save the per-fold naive Bayes scores (one row per category and fold) without the index column.
df_result.to_csv('../results/NB-idf.csv', index=False)