# Imports

In [26]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import RandomizedSearchCV

from random import randint

# Config

In [2]:
label_type = '2_way_label' # 2_way_label, 3_way_label, or 6_way_label

selected_model = RandomForestClassifier()

# Pipeline([
#     ('ss', StandardScaler()), ('lr', RandomForestClassifier(random_state=33))
# ])  

# Logistic Regression | Support Vector Classifier | Decision Tree | Random Forest Classifier | GradientBoostingClassifier | RidgeClassifier

# Data Loading

In [3]:
data_vect = pd.read_csv('/content/drive/MyDrive/Fakeddit_Project/count_vectorize_data.csv')

In [4]:
data_embed = pd.read_csv('/content/drive/MyDrive/Fakeddit_Project/embed_data.csv')

In [5]:
embeds = data_embed.drop(columns=['created_utc', 'num_comments',	'score',	'upvote_ratio',	'2_way_label',	'3_way_label',	'6_way_label'])

In [6]:
combined_dataset = pd.concat([embeds, data_vect], axis=1)

combined_dataset = data_vect # data_embed gives high recall low precision, data_vect gives mid both
combined_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,997,998,999,created_utc,num_comments,score,upvote_ratio,2_way_label,3_way_label,6_way_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1565712000.0,0.0,10,0.79,0,2,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1550534000.0,8.0,10,0.67,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1371868000.0,0.0,7,0.99,0,2,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1536611000.0,0.0,8,0.9,0,2,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1437689000.0,0.0,5,0.86,0,2,2


In [7]:
X = combined_dataset.drop(columns=['2_way_label','3_way_label','6_way_label'])
X_train, X_test, y_train, y_test = train_test_split(X, combined_dataset[label_type], test_size=0.25, random_state=42)

print('Label: ', label_type)
print('Dimensions of training and testing data: \n')

print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')

print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

Label:  2_way_label
Dimensions of training and testing data: 

X_train: (1500, 1004)
y_train: (1500,)
X_test: (500, 1004)
y_test: (500,)


# Test RFC Model

In [8]:
print(f'Selected Model: {selected_model}')
selected_model.fit(X_train, y_train)

Selected Model: RandomForestClassifier()


RandomForestClassifier()

In [29]:
def eval_all(model):
  y_pred = model.predict(X_test)

  print('Model:', model)
  print('Label Type:', label_type)
  print('Acc:', accuracy_score(y_test, y_pred))
  print('Precision:', precision_score(y_test, y_pred))
  print('Recall:', recall_score(y_test, y_pred))
  print('F1:', f1_score(y_test, y_pred), '\n')

  print(confusion_matrix(y_test, y_pred))

In [30]:
eval_all(selected_model)

Model: RandomForestClassifier()
Label Type: 2_way_label
Acc: 0.738
Precision: 0.7962962962962963
Recall: 0.7987616099071208
F1: 0.7975270479134466 

[[111  66]
 [ 65 258]]


# Hyperparameters

precision = of all positive predictions, how many are really positive

recall = of all real positive cases, how many are predicted as positive

tn, fp, fn, tp

## RFC

In [55]:
print('Base RFC')
eval_all(selected_model)

Base RFC
Model: RandomForestClassifier()
Label Type: 2_way_label
Acc: 0.738
Precision: 0.7962962962962963
Recall: 0.7987616099071208
F1: 0.7975270479134466 

[[111  66]
 [ 65 258]]


In [51]:
# RFC
random_grid = {'n_estimators': [5, 10, 100, 200, 1000],
               'max_depth': [None, 5, 10, 20, 40, 50, 100],
               'max_features': ['sqrt', 'log2', None, 5, 10, 100, 1000],
               'class_weight': ['balanced', 'balanced_subsample']}
(random_grid)

{'n_estimators': [5, 10, 100, 200, 1000],
 'max_depth': [None, 5, 10, 20, 40, 50, 100],
 'max_features': ['sqrt', 'log2', None, 5, 10, 100, 1000],
 'class_weight': ['balanced', 'balanced_subsample']}

In [52]:
model = RandomForestClassifier()

model_hyp = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1, scoring='f1')

# Fit the random search model
model_hyp.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'class_weight': ['balanced',
                                                         'balanced_subsample'],
                                        'max_depth': [None, 5, 10, 20, 40, 50,
                                                      100],
                                        'max_features': ['sqrt', 'log2', None,
                                                         5, 10, 100, 1000],
                                        'n_estimators': [5, 10, 100, 200,
                                                         1000]},
                   random_state=42, scoring='f1', verbose=2)

In [54]:
eval_all(model_hyp.best_estimator_)

Model: RandomForestClassifier(class_weight='balanced_subsample', max_depth=100,
                       max_features=100, n_estimators=200)
Label Type: 2_way_label
Acc: 0.75
Precision: 0.801829268292683
Recall: 0.8142414860681114
F1: 0.8079877112135176 

[[112  65]
 [ 60 263]]


In [83]:
pd.DataFrame(model_hyp.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,param_loss,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.946866,0.094282,0.018727,0.000428,100,100,50,exponential,0.5,"{'n_estimators': 100, 'max_features': 100, 'ma...",0.817518,0.837905,0.799007,0.782828,0.793103,0.806073,0.019508,3
1,0.408567,0.014422,0.016022,0.00036,10,100,100,exponential,0.1,"{'n_estimators': 10, 'max_features': 100, 'max...",0.847222,0.839907,0.805556,0.800937,0.820513,0.822827,0.018274,2
2,0.869095,0.024239,0.01748,0.000417,100,100,10,deviance,10.0,"{'n_estimators': 100, 'max_features': 100, 'ma...",0.586118,0.353383,0.491329,0.262295,0.712329,0.481091,0.160491,8
3,3.081182,0.54697,0.016936,0.000544,100,,100,exponential,0.5,"{'n_estimators': 100, 'max_features': None, 'm...",0.802956,0.779221,0.759494,0.758794,0.744304,0.768954,0.020307,7
4,0.967287,0.023844,0.021889,0.002271,300,10,10,exponential,0.5,"{'n_estimators': 300, 'max_features': 10, 'max...",0.806045,0.786408,0.779156,0.795,0.786632,0.790648,0.009189,5
5,0.352747,0.008935,0.016484,0.000534,10,10,100,deviance,0.5,"{'n_estimators': 10, 'max_features': 10, 'max_...",0.779221,0.796117,0.775819,0.811224,0.74359,0.781194,0.022678,6
6,3.483649,0.993176,0.029224,0.01777,50,,10,deviance,0.001,"{'n_estimators': 50, 'max_features': None, 'ma...",0.790323,0.790323,0.792757,0.792757,0.792757,0.791783,0.001192,4
7,3.475393,0.150476,0.026328,0.00297,100,log2,100,deviance,0.01,"{'n_estimators': 100, 'max_features': 'log2', ...",0.84,0.803493,0.828508,0.844749,0.82774,0.828898,0.014293,1
8,0.021676,0.002083,0.0,0.0,100,10,100,log_loss,10.0,"{'n_estimators': 100, 'max_features': 10, 'max...",,,,,,,,9
9,0.021414,0.002635,0.0,0.0,100,50,10,log_loss,0.01,"{'n_estimators': 100, 'max_features': 50, 'max...",,,,,,,,10


## GBC

In [75]:
print('Base GBC')
eval_all(GradientBoostingClassifier().fit(X_train, y_train))

Base GBC
Model: GradientBoostingClassifier()
Label Type: 2_way_label
Acc: 0.724
Precision: 0.7453580901856764
Recall: 0.8699690402476781
F1: 0.8028571428571428 

[[ 81  96]
 [ 42 281]]


In [85]:
# Boost
random_grid = {'loss': ['log_loss', 'deviance', 'exponential'],
               'learning_rate': [0.001, 0.01, 0.1, 0.5, 10],
               'n_estimators': [10, 50, 100, 300],
               'max_depth': [3, 10, 50, 100],
               'max_features': [None, 'sqrt', 'log2', 10, 50, 100]}

(random_grid)

{'loss': ['log_loss', 'deviance', 'exponential'],
 'learning_rate': [0.001, 0.01, 0.1, 0.5, 10],
 'n_estimators': [10, 50, 100, 300],
 'max_depth': [3, 10, 50, 100],
 'max_features': [None, 'sqrt', 'log2', 10, 50, 100]}

In [86]:
model = GradientBoostingClassifier()

model_hyp = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1, scoring='f1')

# Fit the random search model
model_hyp.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 525, in fit
    self._check_params()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 282, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'log_loss' not supported. 

 0.79178296 0.8260857         nan        nan]


RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'learning_rate': [0.001, 0.01, 0.1, 0.5,
                                                          10],
                                        'loss': ['log_loss', 'deviance',
                                                 'exponential'],
                                        'max_depth': [3, 10, 50, 100],
                                        'max_features': [None, 'sqrt', 'log2',
                                                         10, 50, 100],
                                        'n_estimators': [10, 50, 100, 300]},
                   random_state=42, scoring='f1', verbose=2)

In [88]:
eval_all(model_hyp.best_estimator_)

Model: GradientBoostingClassifier(learning_rate=0.01, max_depth=100,
                           max_features='log2')
Label Type: 2_way_label
Acc: 0.71
Precision: 0.7109004739336493
Recall: 0.9287925696594427
F1: 0.8053691275167784 

[[ 55 122]
 [ 23 300]]


In [89]:
pd.DataFrame(model_hyp.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,param_loss,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.081805,0.252882,0.018902,0.000765,100,100,50,exponential,0.5,"{'n_estimators': 100, 'max_features': 100, 'ma...",0.85213,0.830846,0.800995,0.796992,0.778055,0.811804,0.026321,3
1,0.402684,0.015737,0.016873,0.000912,10,100,100,exponential,0.1,"{'n_estimators': 10, 'max_features': 100, 'max...",0.836782,0.824074,0.794457,0.806452,0.799065,0.812166,0.01591,2
2,0.891906,0.023335,0.017532,0.000394,100,100,10,deviance,10.0,"{'n_estimators': 100, 'max_features': 100, 'ma...",0.395973,0.665,0.705882,0.610354,0.612245,0.597891,0.107049,8
3,3.772253,1.321466,0.024798,0.009884,100,,100,exponential,0.5,"{'n_estimators': 100, 'max_features': None, 'm...",0.789082,0.765625,0.736573,0.763819,0.736041,0.758228,0.019996,7
4,1.093047,0.247121,0.020875,0.000883,300,10,10,exponential,0.5,"{'n_estimators': 300, 'max_features': 10, 'max...",0.788945,0.806683,0.774194,0.77551,0.785,0.786066,0.01172,5
5,0.367587,0.017923,0.0177,0.00087,10,10,100,deviance,0.5,"{'n_estimators': 10, 'max_features': 10, 'max_...",0.78481,0.790123,0.772727,0.807882,0.760925,0.783294,0.015909,6
6,2.229948,0.028875,0.016208,0.000321,50,,10,deviance,0.001,"{'n_estimators': 50, 'max_features': None, 'ma...",0.790323,0.790323,0.792757,0.792757,0.792757,0.791783,0.001192,4
7,3.329167,0.17522,0.025728,0.003454,100,log2,100,deviance,0.01,"{'n_estimators': 100, 'max_features': 'log2', ...",0.829596,0.816964,0.828194,0.83871,0.816964,0.826086,0.008277,1
8,0.021751,0.002412,0.0,0.0,100,10,100,log_loss,10.0,"{'n_estimators': 100, 'max_features': 10, 'max...",,,,,,,,9
9,0.019898,0.000425,0.0,0.0,100,50,10,log_loss,0.01,"{'n_estimators': 100, 'max_features': 50, 'max...",,,,,,,,10


## LR

In [90]:
print('Base LR')
lr = Pipeline([
    ('ss', StandardScaler()), ('lr', LogisticRegression()) 
])

eval_all(lr.fit(X_train, y_train))

Base LR
Model: Pipeline(steps=[('ss', StandardScaler()), ('lr', LogisticRegression())])
Label Type: 2_way_label
Acc: 0.646
Precision: 0.7517241379310344
Recall: 0.6749226006191951
F1: 0.7112561174551386 

[[105  72]
 [105 218]]


In [91]:
lr.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'ss', 'lr', 'ss__copy', 'ss__with_mean', 'ss__with_std', 'lr__C', 'lr__class_weight', 'lr__dual', 'lr__fit_intercept', 'lr__intercept_scaling', 'lr__l1_ratio', 'lr__max_iter', 'lr__multi_class', 'lr__n_jobs', 'lr__penalty', 'lr__random_state', 'lr__solver', 'lr__tol', 'lr__verbose', 'lr__warm_start'])

In [92]:
random_grid = {'lr__C': [0.1, 1, 10, 100],
               'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
               'lr__max_iter': [10, 100, 1000]}

(random_grid)

{'lr__C': [0.1, 1, 10, 100],
 'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
 'lr__max_iter': [10, 100, 1000]}

In [93]:
model = lr

model_hyp = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1, scoring='f1')

# Fit the random search model
model_hyp.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('ss', StandardScaler()),
                                             ('lr', LogisticRegression())]),
                   n_jobs=-1,
                   param_distributions={'lr__C': [0.1, 1, 10, 100],
                                        'lr__max_iter': [10, 100, 1000],
                                        'lr__solver': ['newton-cg', 'lbfgs',
                                                       'liblinear', 'sag',
                                                       'saga']},
                   random_state=42, scoring='f1', verbose=2)

In [94]:
eval_all(model_hyp.best_estimator_)

Model: Pipeline(steps=[('ss', StandardScaler()),
                ('lr', LogisticRegression(C=100, max_iter=10, solver='sag'))])
Label Type: 2_way_label
Acc: 0.664
Precision: 0.7384615384615385
Recall: 0.7430340557275542
F1: 0.7407407407407408 

[[ 92  85]
 [ 83 240]]


In [95]:
pd.DataFrame(model_hyp.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__solver,param_lr__max_iter,param_lr__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.21546,0.002161,0.020861,0.002533,newton-cg,10,0.1,"{'lr__solver': 'newton-cg', 'lr__max_iter': 10...",0.722513,0.726368,0.713178,0.763359,0.699752,0.725034,0.021245,4
1,0.286049,0.020707,0.02117,0.002773,newton-cg,100,0.1,"{'lr__solver': 'newton-cg', 'lr__max_iter': 10...",0.722513,0.726368,0.713178,0.763359,0.699752,0.725034,0.021245,4
2,0.268262,0.015377,0.020828,0.001553,lbfgs,100,10.0,"{'lr__solver': 'lbfgs', 'lr__max_iter': 100, '...",0.710875,0.694517,0.691689,0.751295,0.684073,0.70649,0.024049,7
3,0.350916,0.077818,0.025794,0.009298,newton-cg,10,100.0,"{'lr__solver': 'newton-cg', 'lr__max_iter': 10...",0.70137,0.680965,0.677596,0.734043,0.675603,0.693915,0.022069,9
4,7.685577,0.514236,0.015486,0.001332,sag,1000,0.1,"{'lr__solver': 'sag', 'lr__max_iter': 1000, 'l...",0.722513,0.72818,0.713178,0.760204,0.70297,0.725409,0.01939,3
5,2.147768,0.034382,0.01491,0.000671,saga,100,100.0,"{'lr__solver': 'saga', 'lr__max_iter': 100, 'l...",0.705882,0.746867,0.717949,0.73822,0.703242,0.722432,0.01737,6
6,0.220882,0.003326,0.014735,0.000124,sag,10,10.0,"{'lr__solver': 'sag', 'lr__max_iter': 10, 'lr_...",0.700809,0.753117,0.726343,0.743455,0.711443,0.727033,0.019398,2
7,0.227391,0.005582,0.016123,0.001001,sag,10,100.0,"{'lr__solver': 'sag', 'lr__max_iter': 10, 'lr_...",0.699187,0.755,0.734177,0.729659,0.717822,0.727169,0.01844,1
8,0.406875,0.018853,0.016947,0.000485,liblinear,1000,0.1,"{'lr__solver': 'liblinear', 'lr__max_iter': 10...",0.698667,0.71875,0.696335,0.727273,0.685139,0.705233,0.015457,8
9,3.311125,0.500564,0.015451,0.002548,liblinear,1000,100.0,"{'lr__solver': 'liblinear', 'lr__max_iter': 10...",0.7,0.666667,0.6759,0.725806,0.673854,0.688446,0.02179,10


LR, EN, RF