In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
import time
import tqdm
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import transformers
from sklearn.feature_selection import SelectKBest, f_classif, RFE, VarianceThreshold, SelectFromModel
from xgboost import XGBClassifier


train = pd.read_csv("./dreaddit-train.csv")
test = pd.read_csv("./dreaddit-test.csv")

train['subreddit'] = le.fit_transform(train['subreddit'])
test['subreddit'] = le.transform(test['subreddit'])
merged = train.append(test)


#############################################
feature_selection = 1 #1 NEW FEATURE SELECTION #2 HOMEWORK FEATURE SELECTION\
rand_st = 1
#############################################

#creating a function
def func_tokenizer(tokenizer_name, docs):
    features = []
    for doc in tqdm.tqdm(docs, desc = 'converting documents to features'):
        tokens = tokenizer_name.tokenize(doc)
        ids = tokenizer_name.convert_tokens_to_ids(tokens)
        features.append(ids)
    return features

def get_models():
    models = list()
    models.append(('RandomForestClassifier', RandomForestClassifier()))
    models.append(('LogisticRegression', LogisticRegression()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('SVC', SVC()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('MLPClassifier', MLPClassifier()))
    models.append(('XGBClassifier', XGBClassifier()))
    return models

tokenizer = transformers.BertTokenizer.from_pretrained('bert-large-uncased')
#Make X have everything except label
new_copy_train = merged.copy()
X,y = new_copy_train.drop(['label','post_id','id','subreddit','sentence_range','text'], axis=1), new_copy_train['label']

selected_features = []
#My feature selection
if feature_selection == 1:
    # Feature selection
    selector = SelectKBest(f_classif, k=10)
    X_new = selector.fit_transform(X, y)

    # Get the support mask
    mask = selector.get_support()

    # Get the selected features
    selected_features = X.columns[mask]

    # Print the selected features
    print("Selected Features: ", selected_features)
if feature_selection == 2:
    sel = VarianceThreshold(threshold=0.5)
    fit_mod=sel.fit(X)
    fitted=sel.transform(X)
    sel_idx=fit_mod.get_support()
    #Get lists of selected and non-selected features (names and indexes)
    header = list(X.columns.values)
    temp=[]
    temp_idx=[]
    temp_del=[]
    for i in range(len(sel_idx)):
        if sel_idx[i]==True:
            temp.append(header[i])
            temp_idx.append(i)
        else:
            temp_del.append(i)
    selected_features=temp
    sel_features_idx=temp_idx
    del_features=temp_del
    print("Selected Features: ", selected_features)
    print("Deleted Features: ", del_features)


# use selected features to fit the model and add back text column and tokenized text column
X['text'] = train['text']
bert_features = func_tokenizer(tokenizer, X['text'])
bert_trg = pad_sequences(bert_features, maxlen=500, dtype='long', truncating='post', padding='post')
X = pd.DataFrame(bert_trg)

#add back the selected features
for i in range(len(selected_features)):
    X[selected_features[i]] = X_new[:,i]


X.columns = X.columns.astype(str)


scorers = {'Accuracy': 'accuracy', 'roc_auc': 'roc_auc'}                                                                                                                
model_params = {
    'RandomForestClassifier': {'n_estimators': 250, 'max_depth': None, 'min_samples_split': 3, 'criterion': 'entropy', 'random_state': rand_st},
    'DecisionTreeClassifier': {'max_depth': None, 'min_samples_split': 3, 'criterion': 'entropy', 'random_state': rand_st},
    'SVC': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'random_state': rand_st},
    'KNeighborsClassifier': {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto'},
    'LogisticRegression': {'penalty': 'l2', 'C': 1.0, 'solver': 'sag', 'random_state': rand_st, 'max_iter': 350},
    'GaussianNB': {'var_smoothing': 1e-09},
    'AdaBoostClassifier': {'n_estimators': 100, 'random_state': rand_st},
    'GradientBoostingClassifier': {'n_estimators': 100, 'max_depth': 3, 'random_state': rand_st},
    'MLPClassifier': {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0001, 'batch_size': 'auto', 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'power_t': 0.5, 'max_iter': 200, 'shuffle': True, 'random_state': rand_st, 'tol': 0.0001, 'verbose': False, 'warm_start': False, 'momentum': 0.9, 'nesterovs_momentum': True, 'early_stopping': False, 'validation_fraction': 0.1, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08},
    'XGBClassifier': {'n_estimators': 250, 'max_depth': 3, 'learning_rate': 0.1, 'random_state': rand_st}
}
model_results = []
from sklearn.metrics import accuracy_score
#import for auc score
from sklearn.metrics import roc_auc_score
scores={}
import warnings
warnings.filterwarnings('ignore')


#UPDATE THIS VALUE TO 1 IN ORDER TO RUN CROSS VALIDATION
cross_val = 0
########################################################



if cross_val == 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    for models in get_models():
        name, clf = models
        clf.set_params(**model_params[name])
        clf.fit(X_train, y_train)    
        start_ts=time.time()
        y_pred = clf.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        test_roc_auc = roc_auc_score(y_test, y_pred)
        scores['test_accuracy'] = test_accuracy
        scores['test_roc_auc'] = test_roc_auc
        scores['runtime'] = time.time()-start_ts
        print(name)
        print(scores)
        model_results.append((name, scores))
else:
    for models in get_models():
        name, clf = models
        clf.set_params(**model_params[name])
        clf.fit(X, y)
        start_ts=time.time()
        scores = cross_validate(clf, X, y, scoring=scorers, cv=5)    
        scores_Acc = scores['test_Accuracy']                                                                                                                                    
        print(name,"ACC: %0.2f (+/- %0.2f)" % (scores_Acc.mean(), scores_Acc.std() * 2))                                                                                                    
        scores_AUC= scores['test_roc_auc']                
        print(name,"AUC: %0.2f (+/- %0.2f)" % (scores_AUC.mean(), scores_AUC.std() * 2))                           
        print("CV Runtime:", time.time()-start_ts)
        model_results.append((name, scores_Acc.mean(), scores_Acc.std(), scores_AUC.mean(), scores_AUC.std(), time.time()-start_ts))


Selected Features:  Index(['lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_i',
       'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_anx', 'lex_liwc_social',
       'lex_dal_min_pleasantness', 'sentiment'],
      dtype='object')


converting documents to features: 100%|██████████| 3553/3553 [00:12<00:00, 285.30it/s]


RandomForestClassifier
{'test_accuracy': 0.7116736990154712, 'test_roc_auc': 0.7080766652835848, 'runtime': 0.10447049140930176}
LogisticRegression
{'test_accuracy': 0.6216596343178622, 'test_roc_auc': 0.621453193322907, 'runtime': 0.008110523223876953}
KNeighborsClassifier
{'test_accuracy': 0.530239099859353, 'test_roc_auc': 0.5345783728575533, 'runtime': 0.06361985206604004}
DecisionTreeClassifier
{'test_accuracy': 0.6033755274261603, 'test_roc_auc': 0.6010700264913345, 'runtime': 0.00831913948059082}
SVC
{'test_accuracy': 0.5555555555555556, 'test_roc_auc': 0.548438447543966, 'runtime': 0.7769894599914551}
GaussianNB
{'test_accuracy': 0.4711673699015471, 'test_roc_auc': 0.5126113114806422, 'runtime': 0.01600193977355957}
AdaBoostClassifier
{'test_accuracy': 0.7130801687763713, 'test_roc_auc': 0.7122179311225304, 'runtime': 0.12847590446472168}
GradientBoostingClassifier
{'test_accuracy': 0.7144866385372715, 'test_roc_auc': 0.711950623982637, 'runtime': 0.0077402591705322266}
MLPClas