In [1]:
#imports the basic data science lib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# import display module
from IPython.display import display
#import stratify kfold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# import svc,knn    
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
# nlp 
from nltk.stem import WordNetLemmatizer,PorterStemmer
import re
import gensim
# https://github.com/alexandres/lexvec#pre-trained-vectors
# parameter tuning
import optuna
#import typing
from typing import List, Dict, Tuple, Set, Union, Optional, Callable, Any


In [2]:
# Load the pre-split features and labels. Every file is read with header=None,
# so all rows are treated as data and the column names are supplied here.
x_train, x_test = (
    pd.read_csv(csv_path, header=None, names=['website', 'text'])
    for csv_path in ("./Dataset/x_train.csv", "./Dataset/x_test.csv")
)
y_train, y_test = (
    pd.read_csv(csv_path, header=None, names=['positive'])
    for csv_path in ("./Dataset/y_train.csv", "./Dataset/y_test.csv")
)

# Quick sanity check: shapes of all four frames, then a peek at the training data.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
display(x_train.head())
display(y_train.head())

(2400, 2) (2400, 1) (600, 2) (600, 1)


Unnamed: 0,website,text
0,amazon,Oh and I forgot to also mention the weird colo...
1,amazon,THAT one didn't work either.
2,amazon,Waste of 13 bucks.
3,amazon,"Product is useless, since it does not have eno..."
4,amazon,None of the three sizes they sent with the hea...


Unnamed: 0,positive
0,0
1,0
2,0
3,0
4,0


In [3]:
#create a text pre processing class


class TextPreprocessing:
    """Clean and normalise raw text while accumulating a corpus vocabulary.

    Calling an instance on a string runs ``text_cleaning`` followed by
    ``text_lemmatization`` and, when ``bow`` is true, records the resulting
    tokens in ``bag_of_words`` / ``word_counts``.
    """

    # Ordered (compiled pattern, replacement) pairs applied by text_cleaning.
    # Compiled once at class creation instead of on every call.
    _CLEANING_RULES = tuple(
        (re.compile(pattern), replacement)
        for pattern, replacement in (
            # HTML tags act as separators ("great.<br />The"), so they become
            # a space rather than being deleted, which would fuse the words.
            (r'<.*?>', ' '),
            # URLs; the dot after "www" is escaped so that ordinary text such
            # as "awww nice" is no longer swallowed as a URL.
            (r'http\S+|www\.\S+', ''),
            # e-mail addresses
            (r'\S+@\S+', ''),
            # New lines separate words as well -> space, not empty string.
            (r'\n', ' '),
            # punctuation / special characters
            (r'[^\w\s]', ''),
            # digits
            (r'\d+', ''),
            # tokens of one or two characters (a crude stop-word filter)
            (r'\b\w{1,2}\b', ''),
            # collapse runs of whitespace
            (r'\s+', ' '),
        )
    )

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        # Unique tokens seen so far, in first-seen order.
        self.bag_of_words = []
        # token -> number of occurrences seen so far
        self.word_counts = {}
        # Set mirror of bag_of_words giving O(1) membership checks.
        self._vocabulary = set()

    def text_cleaning(self,text:str):
        """Return ``text`` lower-cased with markup, URLs, e-mails, punctuation,
        digits and 1-2 character tokens removed.

        Whitespace is collapsed to single spaces and leading/trailing spaces
        are stripped.
        """
        for pattern, replacement in self._CLEANING_RULES:
            text = pattern.sub(replacement, text)
        return text.lower().strip()

    def text_stemming(self,text:str):
        """Stem every whitespace-separated token and re-join with single spaces."""
        return " ".join(self.stemmer.stem(word) for word in text.split())

    def text_lemmatization(self,text:str):
        """Lemmatize every whitespace-separated token and re-join with single spaces."""
        return " ".join(self.lemmatizer.lemmatize(word) for word in text.split())

    def text_tokenization(self,text:str):
        """Split ``text`` on whitespace and return the list of tokens."""
        return text.split()

    def text_bag_of_words(self,text:str):
        """Add the tokens of ``text`` to the vocabulary and the word counts.

        Returns the (shared, growing) list of unique tokens. Only unseen
        tokens are appended, so the cost is proportional to the number of
        tokens in ``text`` rather than to the vocabulary size, and the order
        is deterministic (first occurrence).
        """
        tokens = self.text_tokenization(text)
        for word in tokens:
            if word not in self._vocabulary:
                self._vocabulary.add(word)
                self.bag_of_words.append(word)
        self.text_word_counts(tokens)
        return self.bag_of_words

    def text_word_counts(self,token:List[str]):
        """Increment the occurrence count of each token; return the counts dict."""
        for word in token:
            self.word_counts[word] = self.word_counts.get(word, 0) + 1
        return self.word_counts

    def __call__(self,text:str,bow:bool=True):
        """Clean and lemmatize ``text``; update the vocabulary when ``bow`` is true.

        Returns the processed string.
        """
        text = self.text_cleaning(text)
        text = self.text_lemmatization(text)
        if bow:
            self.text_bag_of_words(text)
        return text

    def get_bag_of_words(self):
        """Return ``(bag_of_words, word_counts)`` accumulated so far."""
        return self.bag_of_words,self.word_counts
# define stratified k-fold cross-validation
def stratified_kfold_cross_validation(x_train,y_train,clf,n_splits=3,):
    """Score ``clf`` with shuffled, stratified k-fold cross-validation.

    The features are taken from ``x_train["vec"]`` and the labels from
    ``y_train["positive"]``. ``clf`` is fitted in place on each fold's
    training part and evaluated on the held-out part.

    Returns a list with one dict per fold holding the keys ``accuracy``,
    ``f1_score``, ``precision`` and ``recall``.
    """
    features = x_train["vec"]
    labels = y_train["positive"]
    # Fixed seed so every classifier is scored on identical splits.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    metric_fns = {
        "accuracy": accuracy_score,
        "f1_score": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }

    fold_scores = []
    for fit_idx, val_idx in splitter.split(features, labels):
        clf.fit(features.iloc[fit_idx], labels.iloc[fit_idx])
        predicted = clf.predict(features.iloc[val_idx])
        expected = labels.iloc[val_idx]
        fold_scores.append(
            {metric: fn(expected, predicted) for metric, fn in metric_fns.items()}
        )
    return fold_scores


In [4]:
# Shared preprocessor; it accumulates the vocabulary across every text it sees.
t_preprocess = TextPreprocessing()

# Pre-trained word vectors stored in the text (non-binary) word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    './models/lexvec.enwiki+newscrawl.300d.W.pos.vectors.gz',
    binary=False,
)

In [5]:
## define pipelines for the text preprocessing
def vectorize_text(text):
    """Return the sum of the embedding vectors of the tokens in ``text``.

    Tokens are obtained by whitespace splitting; tokens missing from the
    global ``model`` are skipped. The result is a length-300 array, all zeros
    when no token is known.
    """
    sentence_vector = np.zeros(300)
    known_tokens = (token for token in text.split() if token in model)
    for token in known_tokens:
        # Accumulate in place, in token order.
        np.add(sentence_vector, model[token], out=sentence_vector)
    return sentence_vector
def text_preprocessing_pipeline(df,t_preprocess:"TextPreprocessing",bow=True):
    """Preprocess ``df['text']`` and append one embedding column per dimension.

    Parameters
    ----------
    df : DataFrame with a ``text`` column. It is not modified.
    t_preprocess : callable invoked as ``t_preprocess(text, bow=bow)`` that
        returns the processed string.
    bow : forwarded to ``t_preprocess``.

    Returns
    -------
    A new DataFrame with two-level columns: every input column plus
    ``processed`` appears as ``(name, "value")`` and the embedding of the
    processed text as ``("vec", "vec0")`` ... ``("vec", "vec<N-1>")``, so the
    whole feature matrix can be selected with ``result["vec"]``.
    """
    processed = df['text'].apply(lambda raw: t_preprocess(raw, bow=bow))
    # assign() returns a new frame, so the caller's frame is left untouched.
    out = df.assign(processed=processed)
    # Lift whatever columns are present to two levels instead of assuming the
    # exact (website, text, processed) layout.
    out.columns = pd.MultiIndex.from_product([out.columns, ["value"]])

    vectors = [vectorize_text(sentence) for sentence in processed]
    # Take the width from the data; fall back to 300 for an empty frame.
    n_dims = len(vectors[0]) if vectors else 300
    vec_columns = pd.MultiIndex.from_tuples(
        [("vec", "vec" + str(i)) for i in range(n_dims)]
    )
    vec_df = pd.DataFrame(vectors, index=df.index, columns=vec_columns)
    return pd.concat([out, vec_df], axis=1)

# The pipeline returns a frame with MultiIndex columns and this cell rebinds
# x_train to that result. Running the pipeline on its own output fails and
# would count every word a second time, so only process the raw (flat-column)
# frame; this keeps the cell safe to re-run.
if not isinstance(x_train.columns, pd.MultiIndex):
    x_train = text_preprocessing_pipeline(x_train,t_preprocess)
x_train.head()

Unnamed: 0_level_0,website,text,processed,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec,vec
Unnamed: 0_level_1,value,value,value,vec0,vec1,vec2,vec3,vec4,vec5,vec6,...,vec290,vec291,vec292,vec293,vec294,vec295,vec296,vec297,vec298,vec299
0,amazon,Oh and I forgot to also mention the weird colo...,and forgot also mention the weird color effect...,0.378426,-0.060268,0.184822,0.15183,-0.297788,0.859612,-0.777484,...,0.469973,-0.304046,-1.065574,-1.171022,-0.422527,1.400082,-0.486773,-0.20214,-0.139517,-0.060523
1,amazon,THAT one didn't work either.,that one didnt work either,0.140225,0.041267,0.009939,0.28228,-0.046985,0.766434,-0.256409,...,0.274708,0.063017,-0.292846,-0.345118,0.10519,0.763756,-0.11828,0.137731,-0.208427,0.076719
2,amazon,Waste of 13 bucks.,waste buck,-0.266811,-0.236225,-0.128274,0.126632,-0.088879,0.085906,0.076802,...,0.243682,-0.011763,0.011722,0.027326,0.110818,0.198335,0.000805,-0.019087,0.027806,0.099427
3,amazon,"Product is useless, since it does not have eno...",product useless since doe not have enough char...,0.444849,-0.627699,-0.006317,0.011628,-0.420637,1.228025,-1.320413,...,0.428378,-0.800171,-0.776894,-2.037737,1.194447,1.492296,-0.052499,-0.863386,0.150174,-0.017175
4,amazon,None of the three sizes they sent with the hea...,none the three size they sent with the headset...,0.39214,0.367241,0.398144,0.112205,-0.57809,1.134998,-0.503192,...,0.310214,-0.245398,-0.622091,-1.304139,-0.282142,0.053096,0.452793,-0.22953,-0.028972,-0.467039


In [6]:
# Vocabulary and per-word frequencies collected while preprocessing.
bow, counts = t_preprocess.get_bag_of_words()
vocabulary_size = len(bow)
print("bag of words counts", vocabulary_size)

bag of words counts 4179


In [7]:
# Candidate classifiers as (display name, estimator) pairs.
# Estimators that accept a seed are fixed to 42.
models = list(
    {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Logistic Regression": LogisticRegression(
            random_state=42, solver='lbfgs', max_iter=1000
        ),
        "SVM": SVC(random_state=42),
        "KNN": KNeighborsClassifier(),
    }.items()
)

In [8]:
# Cross-validate every candidate and tag each fold's scores with the model name.
evals = []
for name, clf in models:
    fold_scores = stratified_kfold_cross_validation(x_train, y_train, clf, n_splits=3)
    evals.append(pd.DataFrame(fold_scores).assign(modelname=name))

# One row per (model, fold), averaged to one row per model.
eval_df = pd.concat(evals)
results_df = eval_df.groupby("modelname").mean()
results_df.head()

Unnamed: 0_level_0,accuracy,f1_score,precision,recall
modelname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNN,0.712917,0.727312,0.6929,0.765833
Logistic Regression,0.812083,0.811406,0.814768,0.808333
Random Forest,0.768333,0.759056,0.790798,0.73
SVM,0.842083,0.837358,0.862903,0.813333


In [9]:
# For each metric column, the name of the model with the highest mean score.
results_df.idxmax(axis=0)

accuracy     SVM
f1_score     SVM
precision    SVM
recall       SVM
dtype: object

In [10]:
#parameter tuning for SVM using optuna
def objective(trial):
    """Optuna objective: mean cross-validated accuracy of an SVC.

    Samples SVC hyper-parameters from ``trial``, scores the classifier with
    3-fold stratified cross-validation on ``x_train`` / ``y_train`` and
    returns the mean accuracy (to be maximised).
    """
    params = {
        # C acts multiplicatively, so it is sampled on a log scale. A uniform
        # draw over many orders of magnitude almost never yields small values.
        # The range is kept moderate because very large C values can make
        # libsvm training extremely slow.
        "C":trial.suggest_float("C",1e-3,1e3,log=True),
        "kernel":trial.suggest_categorical("kernel",["linear","rbf"]),
        "gamma":trial.suggest_categorical("gamma",["scale","auto"]),
        # `degree` and `coef0` are deliberately not tuned: they only affect
        # the poly / sigmoid kernels, which are not part of this search space.
    }
    #define the model
    clf = SVC(**params,random_state=42)
    #do stratified k fold cross validation
    fold_scores = pd.DataFrame(
        stratified_kfold_cross_validation(x_train,y_train,clf,n_splits=3)
    )
    #get the mean of the evals
    return fold_scores["accuracy"].mean()
#optimize the model
# Seed the sampler so the sequence of suggested parameters is repeatable,
# matching the fixed seed (42) used by the other stochastic steps here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(
    objective,
    n_trials=50,#300 seconds or 50 trials whichever comes first
    timeout=300,#300 seconds or 50 trials whichever comes first
    # n_jobs is left at its default.
    gc_after_trial=True, #garbage collect after each trial (free up memory)
    show_progress_bar=True
    )

[32m[I 2023-03-24 12:57:37,113][0m A new study created in memory with name: no-name-5c9287d7-2f3a-4a01-b9ac-1dd71550c8aa[0m
  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
#train the best model on the whole dataset
# NOTE(review): this refits an SVC with default hyper-parameters; the values
# found by the Optuna study are not applied here.
clf = SVC(random_state=42)
clf.fit(x_train["vec"],y_train["positive"])

#test the model on the test set
# bow=False: the test texts must not extend the training vocabulary.
_x_test = text_preprocessing_pipeline(x_test,t_preprocess,bow=False)
y_pred = clf.predict(_x_test["vec"])
y_true = y_test["positive"]


accuracy = accuracy_score(y_true,y_pred)
# Stored as `f1`, not `f1_score`: rebinding `f1_score` would overwrite the
# imported metric function, so this cell and every cross-validation cell
# would fail with "'float' object is not callable" when run again.
f1 = f1_score(y_true,y_pred)
precision = precision_score(y_true,y_pred)
recall = recall_score(y_true,y_pred)
confusion_mat = confusion_matrix(y_true,y_pred)
print("accuracy",accuracy)
print("f1_score",f1)
print("precision",precision)
print("recall",recall)
print("confusion matrix",confusion_mat)


