# 2. Tuning Part 1

### for SVM and Log Regression

In [1]:
import pandas as pd
import numpy as np

import re
import nltk
import copy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import plot_confusion_matrix

# Clean and Split

In [2]:
Train = pd.read_csv('./Data/Train_Preprocessed.csv')
Future = pd.read_csv('./Data/Future_Preprocessed.csv')

In [3]:
def cleanAndSplit(df, mode):
    
    text_list = list()
    
    for tweet in df['text']:

        tmp = list()

        if mode == 'NoHashtag':
            tweet = re.sub(r'[^\w%\'-]', ' ', tweet)

            # get rid of non-english characters
            tweet = re.sub(r'[^\x00-\x7F ]+', ' ', tweet) 
        
        else:
        
            # get rid of non-alphanumerical chars, effectively splitting them
            # except for at ' - # and %
            tweet = re.sub(r'[^\x00-\x7F#%\'-]', ' ', tweet)


        # split the words based on space
        words = tweet.split()

        for word in words:

            # get rid of ' and % and - that are outside of words
            word = word.strip('\'%-')
            
            # gets rid of other noise
            if mode == 'NoHashtag':
                word = re.sub(r'[^-\w\']', '', word)
            
            else:
                word = re.sub(r'[^\w#-\']', '', word)


            if word not in ['', '#']:
                tmp.append(word)
                
        text_list.append(tmp)
    
    return text_list
        

In [4]:
TrainWords = Train.copy(deep=True)
TrainWords['text'] = cleanAndSplit(Train, 'Hashtag')

FutureWords = Future.copy(deep=True)
FutureWords['text'] = cleanAndSplit(Future, 'Hashtag')



## Stemming

In [5]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stem_text(text):
    return [ps.stem(w) for w in text]

## Stopwords

In [6]:
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))

# def remove_stopwords(text):
#     return [w for w in text if not w in stop_words]


## Dataset


In [7]:
def join_string(text):
    return ' '.join(text)

Y_PreproTFIDF = TrainWords['sentiment'] # because drop one duplicate 


## Preprocess TFIDF
# Train
TrainPreproTFIDF = TrainWords.copy(deep = True)
TrainPreproTFIDF['text'] = TrainPreproTFIDF['text'].apply(stem_text)
TrainPreproTFIDF['text'] = TrainPreproTFIDF['text'].apply(join_string)
tfidf_vectorizer_Prepro = TfidfVectorizer(lowercase=False, ngram_range=(1,1))
X_PreproTFIDF = tfidf_vectorizer_Prepro.fit_transform(TrainPreproTFIDF['text'])


# Future
FuturePreproTFIDF = FutureWords.copy(deep = True)
FuturePreproTFIDF['text'] = FuturePreproTFIDF['text'].apply(stem_text)
FuturePreproTFIDF['text'] = FuturePreproTFIDF['text'].apply(join_string)
X_Future_PreproTFIDF = tfidf_vectorizer_Prepro.transform(FuturePreproTFIDF['text'])



# ##UNPROCESSED only TFIDF
# # Train
# TrainTFIDF = pd.read_csv("./Data/Train.csv")

# Y_TFIDF = TrainTFIDF['sentiment']

# tfidf_vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1,1))
# X_TrainTFIDF = tfidf_vectorizer.fit_transform(TrainTFIDF['text'])

# # Future
# FutureTFIDF = pd.read_csv("./Data/Future.csv")
# X_Future_TFIDF = tfidf_vectorizer.transform(FutureTFIDF['text'])



# ##UNPROCESSED TFIDF + rid junk
# # Train
# TrainNoJunkTFIDF = TrainTFIDF.copy(deep = True)
# TrainNoJunkTFIDF['text'] = cleanAndSplit(TrainNoJunkTFIDF, 'Hashtag')
# tfidf_vectorizer_NoJunk = TfidfVectorizer(lowercase=False, ngram_range=(1,1))
# TrainNoJunkTFIDF['text'] = TrainNoJunkTFIDF['text'].apply(join_string)
# X_TrainNoJunkTFIDF = tfidf_vectorizer_NoJunk.fit_transform(TrainNoJunkTFIDF['text'])


# # Future
# FutureNoJunkTFIDF = FutureTFIDF.copy(deep = True)
# FutureNoJunkTFIDF['text'] = cleanAndSplit(FutureNoJunkTFIDF, 'Hashtag')
# FutureNoJunkTFIDF['text'] = FutureNoJunkTFIDF['text'].apply(join_string)
# X_Future_NoJunkTFIDF = tfidf_vectorizer_NoJunk.transform(FutureNoJunkTFIDF['text'])




# # W2V
# TrainW2V = pd.read_csv('./Data/W300V_Train.csv')

# FutureW2V = pd.read_csv('./Data/W300V_Future.csv')

# MODELS

## Evaluation

In [None]:
def evaluate(y_test, y_pred):
    eval_ = pd.DataFrame({'p': y_pred, 't': y_test})
    
    y_test = list(y_test)
    
    n = len(y_test)
    n1 = sum([1 for i in range(len(y_test)) if y_test[i] == 'positive'])
    n2 = sum([1 for i in range(len(y_test)) if y_test[i] == 'neutral'])
    n3 = sum([1 for i in range(len(y_test)) if y_test[i] == 'negative'])
    
    predPosd = [ len(eval_[(eval_['p'] == 'positive') & (eval_['t'] == 'positive')])/n,
               len(eval_[(eval_['p'] == 'positive') & (eval_['t'] == 'neutral')])/n,
               len(eval_[(eval_['p'] == 'positive') & (eval_['t'] == 'negative')])/n
              ]
    predNeud = [ len(eval_[(eval_['p'] == 'neutral') & (eval_['t'] == 'positive')])/n,
               len(eval_[(eval_['p'] == 'neutral') & (eval_['t'] == 'neutral')])/n,
               len(eval_[(eval_['p'] == 'neutral') & (eval_['t'] == 'negative')])/n
              ]
    predNegd = [ len(eval_[(eval_['p'] == 'negative') & (eval_['t'] == 'positive')])/n,
               len(eval_[(eval_['p'] == 'negative') & (eval_['t'] == 'neutral')])/n,
               len(eval_[(eval_['p'] == 'negative') & (eval_['t'] == 'negative')])/n
              ]
    
    predPos = [ len(eval_[(eval_['p'] == 'positive') & (eval_['t'] == 'positive')]),
               len(eval_[(eval_['p'] == 'positive') & (eval_['t'] == 'neutral')]),
               len(eval_[(eval_['p'] == 'positive') & (eval_['t'] == 'negative')])
              ]
    predNeu = [ len(eval_[(eval_['p'] == 'neutral') & (eval_['t'] == 'positive')]),
               len(eval_[(eval_['p'] == 'neutral') & (eval_['t'] == 'neutral')]),
               len(eval_[(eval_['p'] == 'neutral') & (eval_['t'] == 'negative')])
              ]
    predNeg = [ len(eval_[(eval_['p'] == 'negative') & (eval_['t'] == 'positive')]),
               len(eval_[(eval_['p'] == 'negative') & (eval_['t'] == 'neutral')]),
               len(eval_[(eval_['p'] == 'negative') & (eval_['t'] == 'negative')])
              ]
    
    accuracy = predPos[0] + predNeu[1] + predNeg[2]
    
    error_rate = 1-accuracy
    
    confus_matrix = pd.DataFrame({'predPos': predPos, 'predNeu': predNeu, 'predNeg': predNeg}, 
                                 index = ['truePos', 'trueNeu', 'trueNeg'])
    confus_matrix_d = pd.DataFrame({'predPos': predPosd, 'predNeu': predNeud, 'predNeg': predNegd}, 
                                 index = ['truePos', 'trueNeu', 'trueNeg'])
    
    # error_reduction_rate = error_rate - 
    
    precision1 = 0 if (sum(predPos)) == 0 else predPos[0]/(sum(predPos))
    precision2 = 0 if (sum(predNeu)) == 0 else predNeu[1]/(sum(predNeu))
    precision3 = 0 if (sum(predNeg)) == 0 else predNeg[2]/(sum(predNeg))
    
    recall1 = 0 if (predPos[0]+predNeu[0]+predNeg[0]) == 0 else predPos[0]/(predPos[0]+predNeu[0]+predNeg[0])
    recall2 = 0 if (predPos[1]+predNeu[1]+predNeg[1]) == 0 else predNeu[1]/(predPos[1]+predNeu[1]+predNeg[1])
    recall3 = 0 if (predPos[2]+predNeu[2]+predNeg[2]) == 0 else predNeg[2]/(predPos[2]+predNeu[2]+predNeg[2])
    
    f11 = 0 if (precision1+recall1) == 0 else (2*precision1*recall1)/(precision1+recall1)
    f12 = 0 if (precision2+recall2) == 0 else (2*precision2*recall2)/(precision2+recall2)
    f13 = 0 if (precision3+recall3) == 0 else (2*precision3*recall3)/(precision3+recall3)
    
    # specificity1 = 
    # specificity2 = 
    # specificity3 = 
    
    col1 = [precision1, recall1, f11]
    col2 = [precision2, recall2, f12]
    col3 = [precision3, recall3, f13]
    
    precision = (n1/n)*precision1 + (n2/n)*precision2 + (n3/n)*precision3
    recall = (n1/n)*recall1 + (n2/n)*recall2 + (n3/n)*recall3
    f1 = (n1/n)*f11 + (n2/n)*f12 + (n3/n)*f13
    
    scores = pd.DataFrame({'Pos': col1, 'Neu': col2, 'Neg': col3}, index = ['precision', 'recall', 'f1'])
    
    return confus_matrix, confus_matrix_d, scores, accuracy/n, precision, recall, f1

## Setup Data

In [8]:
X_ = copy.deepcopy(X_PreproTFIDF) # 有preprocess，TFIDF
Y_ = copy.deepcopy(Y_PreproTFIDF) # 1 用

# X_ = copy.deepcopy(X_TrainTFIDF) # 无preprocess，TFIDF

# X_ = copy.deepcopy(X_TrainNoJunkTFIDF) # 无preprocess，稍微清理了一下标点和不规则符号

# Y_ = copy.deepcopy(Y_TFIDF) # 2 和 3 用


# X_ = copy.deepcopy(TrainW2V) # W2V，你可以ignore

In [None]:
# X_F = copy.deepcopy(X_Future_PreproTFIDF) # 有preprocess，TFIDF

# X_F = copy.deepcopy(X_Future_TFIDF) # 无preprocess，TFIDF

X_F = copy.deepcopy(X_Future_NoJunkTFIDF) # 无preprocess，稍微清理了一下标点和不规则符号

# X_F = copy.deepcopy(FutureW2V) # W2V，你可以ignore

In [None]:
Future = pd.read_csv('./Data/Test.csv')

## 0. 0R

Just run one (Probably should not do train test split, but this was done as a model case for the rest to follow - also probablistically speaking should not make a huge difference)

In [67]:
Train_0R = pd.read_csv('./Data/Train.csv')

X_0R = Train_0R['text']
y_0R = Train_0R['sentiment']


In [68]:
ZeroR = DummyClassifier(strategy = "most_frequent")
X_train, X_test, y_train, y_test = train_test_split(X_0R, y_0R, train_size = 0.8, random_state = 30027)
ZeroR.fit(X_train, y_train)
y_pred = ZeroR.predict(X_test)

In [8]:
plot_confusion_matrix(ZeroR, y_pred, y_test, labels = ['negative', 'neutral', 'positive'])

In [70]:
c_matrix, c_matrix_d, score, accuracy, precision, recall, f1 = evaluate(y_test, y_pred)
print('Confusion Matrix: ')
display(c_matrix)
print('Confusion Matrix Decimal: ')
display(c_matrix_d)
print('Score Matrix: ')
display(score)
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1: ', f1)
print('\n')


Confusion Matrix: 


Unnamed: 0,predPos,predNeu,predNeg
truePos,0,1112,0
trueNeu,0,2539,0
trueNeg,0,710,0


Confusion Matrix Decimal: 


Unnamed: 0,predPos,predNeu,predNeg
truePos,0.0,0.254987,0.0
trueNeu,0.0,0.582206,0.0
trueNeg,0.0,0.162807,0.0


Score Matrix: 


Unnamed: 0,Pos,Neu,Neg
precision,0.0,0.582206,0.0
recall,0.0,1.0,0.0
f1,0.0,0.735942,0.0


Accuracy:  0.5822059160742948
Precision:  0.33896372871190883
Recall:  0.5822059160742948
f1:  0.42846980316308253




Cross Validation

In [80]:
k = 5
kf = KFold(n_splits = k, shuffle = True, random_state = 42)
acc_score = []
precision_score = []
recall_score = []
f1_score = []

for train_index, test_index in kf.split(Train_0R):
    X_train, X_test = X_0R.iloc[train_index], X_0R.iloc[test_index]
    y_train, y_test = y_0R.iloc[train_index], y_0R.iloc[test_index]
    
    ZeroR = DummyClassifier(strategy = "most_frequent")
    ZeroR.fit(X_train, y_train)
    
    y_pred = ZeroR.predict(X_test)
    
    c_matrix, c_matrix_d, score, accuracy, precision, recall, f1 = evaluate(y_test, y_pred)
    
    acc_score.append(accuracy_score(y_test, y_pred))
    precision_score.append(precision)
    recall_score.append(recall)
    f1_score.append(f1)
    
    # print('Confusion Matrix: ')
    # display(c_matrix)
    # print('Confusion Matrix Decimal: ')
    # display(c_matrix_d)
    # print('Score Matrix: ')
    # display(score)
    # print('Accuracy: ', accuracy)
    # print('Precision: ', precision)
    # print('Recall: ', recall)
    # print('f1: ', f1)
    # print('\n')

In [73]:
acc_score

[0.5812886952533822,
 0.5819766108690667,
 0.5759174311926606,
 0.5866972477064221,
 0.5772935779816514]

In [74]:
print(np.mean(acc_score))

0.5806347126006366


In [75]:
print(np.var(acc_score))

1.4479728121201361e-05


In [76]:
print(np.std(acc_score))

0.0038052237938393796


In [81]:
print(np.mean(precision_score))

0.33715114920494504


In [78]:
print(np.mean(recall_score))

0.5806347126006366


In [79]:
print(np.mean(f1_score))

0.4265912398638325


In [102]:
Future_0R = pd.read_csv('./Data/Future.csv')
ypred_0R_Future = ZeroR.predict(Future_0R)

out_0R = copy.deepcopy(Future_0R)
out_0R = out_0R.drop(columns = ['text'])
out_0R['sentiment'] = ypred_0R_Future

out_0R.to_csv('./Predictions/0R.csv', index = False)

## 1. Logistic Regression

In [None]:
LR_Run = LogisticRegression(max_iter = 5000, penalty = 'l1', solver = 'saga', C = 2, multi_class = 'multinomial')

X_train, X_valtest, y_train, y_valtest = train_test_split(X_, Y_, train_size = 0.7, random_state = 30027)
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

LR_Run.fit(X_train, y_train)
accuracy_score(y_val, LR_Run.predict(X_val))

In [None]:
accuracy_score(y_train, LR_Run.predict(X_train))

In [None]:
accuracy_score(y_test, LR_Run.predict(X_test))

With Feature Selection

In [18]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'LR'
DTYPE = 'TFIDF_NoJunk'

cent_storage_cols = {'C': [], 'penalty': [], 'solver': [], 'fs':[], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [2]:

Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for c in range(-9, 11):
    C = 10**((c))
    
    for PS in [('saga', 'elasticnet'), 
               ('saga', 'l1'), ('saga', 'l2'), ('saga', 'none')]:
    # for PS in [('newton-cg', 'l2'), ('newton-cg', 'none'), ('lbfgs', 'l2'), 
    #            ('lbfgs', 'none'), ('liblinear', 'l1'), ('liblinear', 'l2'), 
    #            ('sag', 'l2'), ('sag', 'none'), ('saga', 'elasticnet'), 
    #            ('saga', 'l1'), ('saga', 'l2'), ('saga', 'none')]:

        switch = True
        i = 0
        X = copy.deepcopy(X_)
        y = copy.deepcopy(Y_)

        while switch:
            X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
            X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
            X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
            X2.fit(X_train, y_train)
            X_train = X2.transform(X_train)
            X_val = X2.transform(X_val)
            X_test = X2.transform(X_test)


            clf = LogisticRegression(penalty=PS[1], C = C, solver = PS[0], l1_ratio = 1)
            
            clf.fit(X_train, y_train)
            cent_storage_cols = {'C': [C], 'penalty': [PS[1]], 'solver': [PS[0]], 
                                 'fs':[int(X.shape[1]*0.25)+i], 
                            'train_accuracy': [clf.score(X_train, y_train)], 
                                 'val_accuracy': [clf.score(X_val, y_val)],
                                'test_accuracy': [clf.score(X_test, y_test)]}

            Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

            Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

            i += 1000
            if int(X.shape[1]*0.25)+i >= X.shape[1]:
                switch = False

                    
                    

Without Feature Selection

In [18]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'LR'
DTYPE = 'TFIDF_Preprocessed'

cent_storage_cols = {'alpha': [], 'penalty': [], 'solver': [], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [7]:
Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for c in range(-3, 10):
    C = 10**(-(c))
    
    for PS in [('lbfgs', 'l2'), 
               ('lbfgs', 'none')]:

        X = copy.deepcopy(X_)
        y = copy.deepcopy(Y_)

        X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
        X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
        # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
        # X2.fit(X_train, y_train)
        # X_train = X2.transform(X_train)
        # X_val = X2.transform(X_val)

        clf = LogisticRegression(penalty=PS[1], C = C, solver = PS[0], max_iter = 5000)
        
        clf.fit(X_train, y_train)
        cent_storage_cols = {'C': [C], 'penalty': [PS[1]], 'solver': [PS[0]], 
                                'train_accuracy': [clf.score(X_train, y_train)], 
                                     'val_accuracy': [clf.score(X_val, y_val)],
                                    'test_accuracy': [clf.score(X_test, y_test)]}

        Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

        Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)


                    

## 2. Random Forests

In [10]:
RF_Run = RandomForestClassifier(criterion = 'gini', n_estimators = 50, max_depth = 10, max_features = 0.5, max_samples = 0.25)

X_train, X_valtest, y_train, y_valtest = train_test_split(X, Y, train_size = 0.7, random_state = 30027)
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

RF_Run.fit(X_train, y_train)
accuracy_score(y_val, RF_Run.predict(X_val))


0.6195718654434251

In [11]:
accuracy_score(y_train, RF_Run.predict(X_train))

0.6293165585479327

With Feature Selection

In [13]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns
MODEL_NAME = 'RF'
DTYPE = 'W2V3000'

cent_storage_cols = {'crit': [], 'n_est': [], 'max_depth': [], 
                        'max_feat': [], 'max_sampl': [], 'fs':[], 
                                'train_accuracy': [], 'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [None]:


Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for crit in ('entropy',):
    for n_est in (100,):
        for max_depth in (4, 7, 10):
            for max_feat in (0.5, 0.75):
                for max_sample in (0.5, 0.75):
                    
                    switch = True
                    i = 0
                    X = copy.deepcopy(X_)
                    y = copy.deepcopy(Y_)


                    while switch:
                        X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
                        X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
                        
                        X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
                        X2.fit(X_train, y_train)
                        
                        X_train = X2.transform(X_train)
                        X_val = X2.transform(X_val)
                        X_test = X2.transform(X_test)
                        
                        clf = RandomForestClassifier(criterion = crit, n_estimators = n_est, max_samples = max_sample, max_features = max_feat, max_depth = max_depth)
                        clf.fit(X_train, y_train)
                        
                        cent_storage_cols = {'crit': [crit], 'n_est': [n_est], 'max_depth': [max_depth], 
                                                             'max_feat': [max_feat], 'max_sampl': [max_sample], 'fs':[int(X.shape[1]*0.25)+i], 
                                                             'train_accuracy': [clf.score(X_train, y_train)], 'val_accuracy': [clf.score(X_val, y_val)],
                                                                'test_accuracy': [clf.score(X_test, y_test)]}
                        
                        Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

                        Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)
                        
                        switch = False
                        # i += 1000
                        # if int(X.shape[1]*0.25)+i >= X.shape[1]:
                        #     switch = False

                    
                    

Without Feature Selection

In [17]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'RF'
DTYPE = 'W2V3000'

cent_storage_cols = {'crit': [], 'n_est': [], 'max_depth': [], 
                        'max_feat': [], 'max_sampl': [], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [None]:
Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for crit in ('entropy', 'gini'):
    for n_est in (100,):
        for max_depth in (4, 7, 10):
            for max_feat in (0.5, 0.75):
                for max_sample in (0.5, 0.75):
                    
                    X = copy.deepcopy(X_)
                    y = copy.deepcopy(Y_)


                    
                    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
                    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
                    # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
                    # X2.fit(X_train, y_train)
                    # X_train = X2.transform(X_train)
                    # X_val = X2.transform(X_val)
                    clf = RandomForestClassifier(criterion = crit, n_estimators = n_est, max_samples = max_sample, max_features = max_feat, max_depth = max_depth)
                    clf.fit(X_train, y_train)
                    cent_storage_cols = {'crit': [crit], 'n_est': [n_est], 'max_depth': [max_depth], 
                                                         'max_feat': [max_feat], 'max_sampl': [max_sample], 
                                                         'train_accuracy': [clf.score(X_train, y_train), 
                                                        'val_accuracy': [clf.score(X_val, y_val)], 'test_accuracy':[clf.score(X_test, y_test)}

                    Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

                    Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)
                        

                    
                    

## 3. XGB

In [14]:
xgb_run = xgb.XGBClassifier(booster = 'gbtree', gamma = 0.01, subsample = 0.75, colsample_bytree = 0.75, max_depth = 10, eta = 0.25)

X_train, X_valtest, y_train, y_valtest = train_test_split(X_, Y_, train_size = 0.7, random_state = 30027)
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

y_train = y_train.replace(['negative', 'neutral', 'positive'], [0, 1, 2])
y_val = y_val.replace(['negative', 'neutral', 'positive'], [0, 1, 2])

xgb_run.fit(X_train, y_train)
accuracy_score(y_val, xgb_run.predict(X_val))

0.6045871559633027

In [15]:
accuracy_score(y_train, xgb_run.predict(X_train))

0.9996068152031454

With Feature Selection

In [9]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'XGB'
DTYPE = 'W2V3000'

cent_storage_cols = {'gamma': [], 'sub_sample': [], 'col_sample_by_tree': [], 
                             'max_depth': [], 'eta':[], 'fs':[], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [1]:

Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for gamma in (10, 1, 0.1):
    for sub_sample in (0.5, 0.75):
        for col_sample_by_tree in (0.5, 0.75):
            for max_depth in (4, 7, 10):
                for eta in (0.2,):

                    switch = True
                    i = 0
                    X = copy.deepcopy(X_)
                    y = copy.deepcopy(Y_)
                    y = y.replace(['negative', 'neutral', 'positive'], [0, 1, 2])

                    while switch:
                        X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
                        X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
                        # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
                        # X2.fit(X_train, y_train)
                        # X_train = X2.transform(X_train)
                        # X_val = X2.transform(X_val)
                        # X_test = X2.transform(X_test)
    
    
                        clf = xgb.XGBClassifier(booster = 'gbtree', gamma = gamma, subsample = sub_sample, colsample_bytree = col_sample_by_tree,
                                               max_depth = max_depth, eta = eta)
                        clf.fit(X_train, y_train)
                        cent_storage_cols = {'gamma': [gamma], 'sub_sample': [sub_sample], 'col_sample_by_tree': [col_sample_by_tree], 
                             'max_depth': [max_depth], 'eta':[eta], 'fs':[int(X.shape[1]*0.25)+i], 
                                        'train_accuracy': [clf.score(X_train, y_train)], 
                                             'val_accuracy': [clf.score(X_val, y_val)],
                                            'test_accuracy': [clf.score(X_test, y_test)]}

                        Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

                        Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)
                        
                        switch = False
                        # i += 2500
                        # if int(X.shape[1]*0.25)+i >= X.shape[1]:
                        #     switch = False

                    
                    

Without Feature Selection

In [None]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'XGB'
DTYPE = 

cent_storage_cols = {'gamma': [], 'sub_sample': [], 'col_sample_by_tree': [], 
                             'max_depth': [], 'eta':[], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [None]:
Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for gamma in (0, 0.01, 0.05, 0.1, 0.5, 1):
    for sub_sample in (0.25, 0.5, 0.75):
        for col_sample_by_tree in (0.25, 0.5, 0.75):
            for max_depth in (4, 6, 8, 10):
                for eta in (0.1, 0.15, 0.2, 0.25, 0.3):

                    X = copy.deepcopy(X_train_W2V)
                    y = copy.deepcopy(Y_train_processed)
                    y = y.replace(['negative', 'neutral', 'positive'], [0, 1, 2])

                    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
                    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
                    # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
                    # X2.fit(X_train, y_train)
                    # X_train = X2.transform(X_train)
                    # X_val = X2.transform(X_val)

                    clf = xgb.XGBClassifier(booster = 'gbtree', gamma = gamma, subsample = sub_sample, colsample_bytree = col_sample_by_tree,
                                           max_depth = max_depth, eta = eta)
                    clf.fit(X_train, y_train)
                    cent_storage_cols = {'gamma': [gamma], 'sub_sample': [sub_sample], 'col_sample_by_tree': [col_sample_by_tree], 
                         'max_depth': [max_depth], 'eta':[eta], 'train_accuracy': [clf.score(X_train, y_train)], 
                                 'val_accuracy': [clf.score(X_val, y_val)],
                                'test_accuracy': [clf.score(X_test, y_test)]}

                    Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

                    Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)


                    

## 4. SVM

In [16]:
svm_run = svm.SVC(C= 10, decision_function_shape= 'ovo', degree= 1, gamma= 0.1, kernel= 'poly')

X_train, X_valtest, y_train, y_valtest = train_test_split(X, Y, train_size = 0.7, random_state = 30027)
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

svm_run.fit(X_train, y_train)
accuracy_score(y_val, svm_run.predict(X_val))

0.5241590214067279

In [18]:
accuracy_score(y_train, svm_run.predict(X_train))

0.9836828309305373

With Feature Selection

In [None]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'SVM'
DTYPE = 'W2V3000'

cent_storage_cols = {'C': [], 'kernel': [], 'gamma': [], 'degree': [], 'fs':[], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [None]:
Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for c in range(-3, 11):
    C = 10**(-(c))
    for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
        for g in range(-3,11):
            gamma = 10**(-(g))
            
            for j in range(1,6):
                if kernel != 'linear':
                    u = j
                    if u>=2:
                        break
                    else:
                        u = None

                switch = True
                i = 0
                X = copy.deepcopy(X_)
                y = copy.deepcopy(Y_)

                while switch:
                    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
                    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
                    # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
                    # X2.fit(X_train, y_train)
                    # X_train = X2.transform(X_train)
                    # X_val = X2.transform(X_val)
                    # X_test = X2.transform(X_test)

                    clf = SVC(C = C, kernel = kernel, gamma = gamma, degree = u)

                    clf.fit(X_train, y_train)
                    cent_storage_cols = {'C': [C], 'kernel': [kernel], 'gamma': [gamma], 'degree': [u], 
                                         'fs':[int(X.shape[1]*0.25)+i], 
                                            'train_accuracy': [clf.score(X_train, y_train)], 
                                                 'val_accuracy': [clf.score(X_val, y_val)],
                                                'test_accuracy': [clf.score(X_test, y_test)]}

                    Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

                    Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)
                    
                    switch = False
                    # i += 1000
                    # if int(X.shape[1]*0.25)+i >= X.shape[1]:
                    #     switch = False

                    
                    

Without Feature Selection

In [None]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'SVM'
DTYPE = 

cent_storage_cols = {'C': [C], 'kernel': [kernel], 'gamma': [gamma], 'degree': [i], 
                     'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning {MODEL_NAME} {DTYPE}.csv', index = False)

In [None]:
Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for c in range(-10, 11):
    C = 10**(-(c))
    for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
        for g in range(-10,11):
            gamma = 10**(-(g))
            
            for i in range(1,10):
            

                X = copy.deepcopy(X_)
                y = copy.deepcopy(Y_)

                X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
                X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
                # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
                # X2.fit(X_train, y_train)
                # X_train = X2.transform(X_train)
                # X_val = X2.transform(X_val)

                clf = xgb.XGBClassifier(booster = 'gbtree', gamma = gamma, subsample = sub_sample, colsample_bytree = col_sample_by_tree,
                                       max_depth = max_depth, eta = eta)
                clf.fit(X_train, y_train)
                cent_storage_cols = {'C': [C], 'kernel': [kernel], 'gamma': [gamma], 'degree': [i], 
                                            'train_accuracy': [clf.score(X_train, y_train)], 
                                                 'val_accuracy': [clf.score(X_val, y_val)],
                                                'test_accuracy': [clf.score(X_test, y_test)]}

                Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

                Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)


                    

## 5. Bernoulli NB

In [None]:
bnb_run = BNB(alpha=1.0, fit_prior=True)

X_train, X_valtest, y_train, y_valtest = train_test_split(X_, Y_, train_size = 0.7, random_state = 30027)
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

# y_train = y_train.replace(['negative', 'neutral', 'positive'], [0, 1, 2])
# y_val = y_val.replace(['negative', 'neutral', 'positive'], [0, 1, 2])

bnb_run.fit(X_train, y_train)
accuracy_score(y_val, bnb_run.predict(X_val))

In [None]:
accuracy_score(y_train, bnb_run.predict(X_train))

With Feature Selection

In [9]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'BNB'
DTYPE = 'W2V3000'

cent_storage_cols = {'alpha': [], 'fs':[], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [10]:

Central_Statistics = pd.read_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv')
for a in range(-2, 11):
    alpha = 10**(-(a))

    switch = True
    i = 0
    X = copy.deepcopy(X_)
    y = copy.deepcopy(Y_)

    while switch:
        X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
        X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
        # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
        # X2.fit(X_train, y_train)
        # X_train = X2.transform(X_train)
        # X_val = X2.transform(X_val)
        # X_test = X2.transform(X_test)


        clf = BNB(alpha=alpha, fit_prior=True)

        clf.fit(X_train, y_train)
        cent_storage_cols = {'alpha': [alpha], 'fs':[int(X.shape[1]*0.25)+i], 
                        'train_accuracy': [clf.score(X_train, y_train)], 
                             'val_accuracy': [clf.score(X_val, y_val)],
                            'test_accuracy': [clf.score(X_test, y_test)]}

        Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

        Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)
        
        switch = False
        # i += 1000
        # if int(X.shape[1]*0.25)+i >= X.shape[1]:
        #     switch = False

                    
                    

Without Feature Selection

In [None]:
# First create a dictionary of empty lists which is a placeholder for each of the 40-ish columns

MODEL_NAME = 'BNB'
DTYPE = 

cent_storage_cols = {'alpha': [], 'train_accuracy': [],
                    'val_accuracy': [], 'test_accuracy':[]}

# Create DataFrame and export

central_data = pd.DataFrame(cent_storage_cols)
central_data.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)

In [None]:
Central_Statistics = pd.read_csv(f'Tuning {MODEL_NAME} {DTYPE}.csv')
for a in range(-10, 11):
    alpha = 10**(-(a))

    X = copy.deepcopy(X_)
    y = copy.deepcopy(Y_)
    y = y.replace(['negative', 'neutral', 'positive'], [0, 1, 2])

    X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
    X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
    # X2 = SelectKBest(chi2, k=int(X.shape[1]*0.25)+i)
    # X2.fit(X_train, y_train)
    # X_train = X2.transform(X_train)
    # X_val = X2.transform(X_val)

    clf = BNB(alpha=alpha, fit_prior=True)
    
    clf.fit(X_train, y_train)
    cent_storage_cols = {'alpha': [alpha], 
                            'train_accuracy': [clf.score(X_train, y_train)], 
                                 'val_accuracy': [clf.score(X_val, y_val)],
                                'test_accuracy': [clf.score(X_test, y_test)]}

    Central_Statistics = Central_Statistics.append(pd.DataFrame(cent_storage_cols))

    Central_Statistics.to_csv(f'./Tuning/Tuning - {MODEL_NAME} {DTYPE}.csv', index = False)


                    

# Early Code for determining which preprocessing combination was best

## SVM

In [3]:

# accu_score = list()
# X = copy.deepcopy(X_train_processed)
# y = copy.deepcopy(Y_train_processed)

# X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
# X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)
# clf = svm.SVC(kernel = 'linear')

# clf.fit(X_train, y_train)

# accu_score.append(clf.score(X_val, y_val))

        

# print("SVM")
# print(accu_score)

## LogR

In [4]:
# import copy
# from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# from sklearn.decomposition import PCA
# accu_score = list()

# X = copy.deepcopy(X_train_processed)
# y = copy.deepcopy(Y_train_processed)

# X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
# X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

# # lgr = LogisticRegression(max_iter = 500, penalty = 'l1', solver = 'saga', C = 2, multi_class = 'multinomial')
# # 0.67217125382263 accuracy just playing arround
# lgr = LogisticRegression(max_iter = 500, penalty = 'l2', solver = 'sag', C = 2)
# lgr.fit(X_train, y_train)
# accu_score.append(lgr.score(X_val, y_val))

# print('Logistic without feature selection')
# print(accu_score)

## RANDOM FOREST WITHOUT FEATURE SELECTION

In [5]:
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# accu_score = list()

# X = copy.deepcopy(X_train_processed)
# y = copy.deepcopy(Y_train_processed)

# X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
# X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

# clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators = 10, max_samples = 0.5, max_features = 0.5)
# clf.fit(X_train, y_train)
# accu_score.append(clf.score(X_val, y_val))

# print('Random Forest without feature selection')
# print(accu_score)

## XGB

In [6]:
# import xgboost as xgb
# import copy
# from sklearn.model_selection import train_test_split
# accu_score = list()

# X = copy.deepcopy(X_train_processed)
# y = copy.deepcopy(Y_train_processed)
# y = y.replace(['negative', 'neutral', 'positive'], [0, 1, 2])


# X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, train_size = 0.7, random_state = 30027)
# X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, train_size = 0.5, random_state = 30027)

# clf = xgb.XGBClassifier(booster = 'gbtree')
# clf.fit(X_train, y_train)
# accu_score.append(clf.score(X_val, y_val))
   


# print("XGB without feature selection")
# max_value = max(accu_score)
# max_index = accu_score.index(max_value)
# print(max_value)