# FORGE

___

Here we are going to do initial model testing on the selected models: Logistic Regression, Support Vector Machines, and Multinomial Naive Bayes.

We will fit and tune hyperparameters for those models in this notebook.

# Imports

___

In [3]:
import pickle
import pandas as pd
import numpy as np


from sklearn.pipeline import Pipeline 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, mean_squared_error

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from gensim.models.word2vec import Word2Vec
import gensim



from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

In [4]:
import random

# Seed both the standard-library generator and NumPy's global generator.
# The original cell seeded only `random`, but the rest of the notebook works
# through NumPy / scikit-learn, whose estimators fall back to NumPy's global
# RandomState when no random_state is supplied.
random.seed(42)
np.random.seed(42)

In [5]:
# Restore the `corpus` object saved in corpus.pkl.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('corpus.pkl', mode='rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [6]:
# Write the loaded corpus to corpus.csv in the current working directory.
corpus.to_csv('./corpus.csv')

___

In [7]:
# Extra tokens to drop on top of scikit-learn's built-in English stop words.
my_stop_words = [
    'https', 'com', 'www', 'people', 'know', 'actually',
    'world', 'time', 'years', 'fact', 'facts', 'fake', 'like',
    'sk', '10', 'en', 'day', 'water', 'did', 'just', 'the',
]

# Combine the custom list with text.ENGLISH_STOP_WORDS (the result is a frozenset).
stop_words = text.ENGLISH_STOP_WORDS.union(my_stop_words)
    

## Train Test Split

___

In [8]:
# Features come from the 'selftext' column, the target from the 'subreddit' column.
X = corpus['selftext']
y = corpus['subreddit']

# Hold out 33% of the rows for testing, stratified on the target and seeded
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, stratify=y, random_state=42
)

# Logistic Regression 

___

## CountVectorizer (Cvec)

In [20]:
logreg = Pipeline([
        ('cvec', CountVectorizer(stop_words=stop_words)),
        ('logreg',LogisticRegression(solver = 'saga'))]);



logreg_params = {
    'cvec__max_df': np.linspace(0.20,0.30,10),
    'cvec__min_df': np.linspace(0.001,1,10),
    'logreg__penalty': ['l1','l2'],
    'logreg__dual': [True,False],
    'logreg__C': np.linspace(0.001,1,10),
    #'logreg__solver': ['newton-cg','lbfgs','liblinear','sag','saga'],
    'logreg__max_iter': [5_000]
}    


grid = GridSearchCV(logreg, logreg_params, cv=5, n_jobs = 6, verbose = 2)

%time
grid.fit(X_train,y_train)

Wall time: 0 ns
Fitting 5 folds for each of 4000 candidates, totalling 20000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    2.8s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   38.8s
[Parallel(n_jobs=6)]: Done 353 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 847 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done 1576 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done 2280 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 3148 tasks      | elapsed:  3.7min
[Parallel(n_jobs=6)]: Done 4230 tasks      | elapsed:  5.2min
[Parallel(n_jobs=6)]: Done 5410 tasks      | elapsed:  5.6min
[Parallel(n_jobs=6)]: Done 6535 tasks      | elapsed:  7.2min
[Parallel(n_jobs=6)]: Done 8165 tasks      | elapsed:  8.3min
[Parallel(n_jobs=6)]: Done 9660 tasks      | elapsed:  9.3min
[Parallel(n_jobs=6)]: Done 11242 tasks      | elapsed: 11.0min
[Parallel(n_jobs=6)]: Done 13014 tasks      | elapsed: 12.8min
[Parallel(n_jobs=6)]: Done 14938 tasks      | elapsed: 14.6

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words=frozenset({'10',
                                                                              'a',
                                                                              'about',
                                                                              'above',
                                                                              'across',
                                                                              'actually',
                                                                              'after',
                                                                              'afterwards',
                                                                              'again',
                                                                              'against',
                                                           

In [21]:
# Show the pipeline refit with the best parameter combination found by `grid`.
grid.best_estimator_

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.22222222222222224, min_df=0.001,
                                 stop_words=frozenset({'10', 'a', 'about',
                                                       'above', 'across',
                                                       'actually', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another'

In [22]:
# Mean cross-validated score of the best candidate in `grid`.
grid.best_score_

0.7330433672538936

In [12]:
# Predict labels for the held-out test set with the fitted grid search.
log_preds = grid.predict(X_test)

In [13]:
# Per-class precision / recall / F1 text report: true test labels vs. the predictions.
summary = classification_report(y_test,log_preds)

In [14]:
# The report is a pre-formatted string, so print it to keep its column layout.
print(summary)

              precision    recall  f1-score   support

   FakeFacts       0.74      0.73      0.73       833
       facts       0.77      0.78      0.78       990

    accuracy                           0.76      1823
   macro avg       0.76      0.75      0.75      1823
weighted avg       0.76      0.76      0.76      1823



In [54]:
# Score the fitted search on the held-out data.  GridSearchCV.score takes
# (X, y): the raw test documents and their true labels.  The original call
# passed the label column as X and the model's own predictions as y, so the
# pipeline was vectorizing label strings instead of the test documents.
grid.score(X_test, y_test)

0.7410861217772902

In [97]:
# Empty placeholder frame; the name is rebound to the result of _chart_grid further down.
predictions_df = pd.DataFrame()

In [98]:
def _chart_grid(grid, name, X=None):
    """Tabulate a fitted classifier's test-set probabilities and predictions.

    Parameters
    ----------
    grid : fitted estimator or search object
        Must expose ``predict_proba`` and ``predict``.
    name : str
        Prefix used for the three output column names.
    X : array-like, optional
        Documents to predict on.  Defaults to the notebook-level ``X_test``,
        which is what the original implementation always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``'<name> Probability'`` (column 1 of ``predict_proba``),
        ``'<name> Prediction'`` (predicted label) and ``'<name> Binary'``
        (1 where the prediction is ``'facts'``, else 0).
    """
    features = X_test if X is None else X

    probs = np.asarray(grid.predict_proba(features))
    # Use the predictions of the estimator that was passed in.  The original
    # body computed them but then read the notebook-level `log_preds`, so every
    # model charted with this helper silently reported the LogReg predictions.
    preds = grid.predict(features)

    temp_df = pd.DataFrame({
        # Column 1 belongs to the second class of the estimator; presumably
        # 'facts', matching the Binary encoding below -- verify via classes_.
        f'{name} Probability': probs[:, 1],
        f'{name} Prediction': preds,
    })
    temp_df[f'{name} Binary'] = np.where(temp_df[f'{name} Prediction'] == 'facts', 1, 0)

    return temp_df

In [100]:
# Build the probability / prediction table for the logistic-regression grid search.
predictions_df =  _chart_grid(grid,'LogReg')

In [122]:
# Encode the test labels as integers (1 for 'facts', 0 otherwise) so they can
# be compared with the 0/1 'Binary' prediction column.
# The dtype guard makes the cell safe to re-run: once y_test is numeric, a
# second `== 'facts'` comparison would no longer match anything and the labels
# would be lost.
if not np.issubdtype(np.asarray(y_test).dtype, np.number):
    y_test = np.where(y_test == 'facts', 1, 0)

In [123]:
# 0/1 encoded logistic-regression predictions taken from the predictions table.
preds = predictions_df['LogReg Binary']

In [124]:
# Mean squared error between the 0/1 test labels and the 0/1 predictions.
# With both inputs binary this is the fraction of misclassified rows.
mean_squared_error(y_test,preds)

0.25891387822270984

In [101]:
# Display the predictions table (pandas abbreviates long frames in the output).
predictions_df

Unnamed: 0,LogReg Probability,LogReg Prediction,LogReg Binary
0,0.871724,facts,1
1,0.636360,facts,1
2,0.997663,facts,1
3,0.277183,FakeFacts,0
4,0.650310,facts,1
...,...,...,...
1818,0.999900,facts,1
1819,0.280876,FakeFacts,0
1820,0.742594,facts,1
1821,0.627395,facts,1


## Support Vector Machine (TF-IDF)

In [None]:
# Parameter sets kept for reference (this cell was never executed).
# The second dictionary is identical to the grid_2.best_params_ output shown
# further down; the run that produced the first one is not visible here.
{'svm__C': 0.112,
 'svm__max_iter': 1000,
 'tfidf__max_df': 0.223,
 'tfidf__min_df': 0.001,
 'tfidf__ngram_range': (1, 3),
 'tfidf__norm': 'l2'}

{'svm__C': 0.556,
 'svm__max_iter': 1000,
 'tfidf__max_df': 0.2,
 'tfidf__min_df': 0.0001,
 'tfidf__ngram_range': (1, 3),
 'tfidf__norm': 'l2'}



In [27]:
svm_tf = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('svm', LinearSVC() ) ]);

svm_tf_params = {
    'tfidf__max_df': np.linspace(0.20,0.30,10),
    'tfidf__min_df': np.linspace(0.0001,0.005,10),
    'tfidf__ngram_range': [(1,2),(1,3),(1,4)],
    'tfidf__norm' : ['l1','l2'],
    
    #'svm__loss': ['hinge','squared_hinge'],
    #'svm__dual': [True,False],
    'svm__C': np.linspace(0.001,1,10),
    'svm__max_iter': [1_000,2_000,3_000]
}    
    
grid_2 = GridSearchCV(svm_tf, svm_tf_params, cv=5, n_jobs = 6, verbose = 2)
    #grid = GridSearchCV(logreg, logreg_params, cv=5, n_jobs = 6, verbose = 2)
%time

grid_2.fit(X_train,y_train)

Wall time: 0 ns
Fitting 5 folds for each of 18000 candidates, totalling 90000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    4.8s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   11.2s
[Parallel(n_jobs=6)]: Done 353 tasks      | elapsed:   22.8s
[Parallel(n_jobs=6)]: Done 636 tasks      | elapsed:   38.6s
[Parallel(n_jobs=6)]: Done 1001 tasks      | elapsed:   58.8s
[Parallel(n_jobs=6)]: Done 1446 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 1973 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done 2580 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done 3269 tasks      | elapsed:  3.1min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed:  3.8min
[Parallel(n_jobs=6)]: Done 4889 tasks      | elapsed:  4.7min
[Parallel(n_jobs=6)]: Done 5820 tasks      | elapsed:  5.6min
[Parallel(n_jobs=6)]: Done 6833 tasks      | elapsed:  6.5min
[Parallel(n_jobs=6)]: Done 7926 tasks      | elapsed:  7.6min
[Parallel(n_jobs=6)]: Done 9101 tasks      | elapsed:  8.7min

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=frozenset({'10',
                                                                              'a',
                                                                              'about',
                                                                              'above',
                                                                              'across',
                                                                              'actually',
                                                                              'after',
                                                                              'afterwards',
                                                                              'again',
                                                                              'against',
                                                          

In [28]:
# Mean cross-validated score of the best candidate in `grid_2`.
grid_2.best_score_

0.7430462851515484

In [29]:
# Parameter combination that produced grid_2.best_score_.
grid_2.best_params_

{'svm__C': 0.556,
 'svm__max_iter': 1000,
 'tfidf__max_df': 0.2,
 'tfidf__min_df': 0.0001,
 'tfidf__ngram_range': (1, 3),
 'tfidf__norm': 'l2'}

In [10]:
# Token pattern, tried in order: a run of word characters, a dollar amount
# such as $12.50, or any other run of non-whitespace characters.
# Written as a raw string so the regex escapes (\w, \$, \d, \., \S) are not
# treated as invalid Python string escapes; the pattern itself is unchanged.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [11]:
logreg = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('MNB',MultinomialNB())]);



logreg_params = {
    'tfidf__max_df': np.linspace(0.20,0.30,10),
    'tfidf__min_df': np.linspace(0.001,1,10),
    'tfidf__tokenizer': [None,tokenizer],
    'tfidf__ngram_range':[(1,1),(1,2),(2,2)],
    'MNB__alpha': np.linspace(0.001,1,10)  
}    


grid_3 = GridSearchCV(logreg, logreg_params, cv=5, n_jobs = 6, verbose = 2)

%time
grid_3.fit(X_train,y_train)

Wall time: 0 ns
Fitting 5 folds for each of 6000 candidates, totalling 30000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    3.3s
[Parallel(n_jobs=6)]: Done 432 tasks      | elapsed:    8.0s
[Parallel(n_jobs=6)]: Done 1244 tasks      | elapsed:   17.2s
[Parallel(n_jobs=6)]: Done 2376 tasks      | elapsed:   29.8s
[Parallel(n_jobs=6)]: Done 3836 tasks      | elapsed:   46.6s
[Parallel(n_jobs=6)]: Done 5616 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done 7724 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 10152 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done 12908 tasks      | elapsed:  2.4min
[Parallel(n_jobs=6)]: Done 15984 tasks      | elapsed:  3.0min
[Parallel(n_jobs=6)]: Done 19388 tasks      | elapsed:  3.6min
[Parallel(n_jobs=6)]: Done 23112 tasks      | elapsed:  4.3min
[Parallel(n_jobs=6)]: Done 27164 tasks      | elapsed:  5.1min
[Parallel(n_jobs=6)]: Done 30000 out of 30000 | elapsed:  5.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=frozenset({'10',
                                                                              'a',
                                                                              'about',
                                                                              'above',
                                                                              'across',
                                                                              'actually',
                                                                              'after',
                                                                              'afterwards',
                                                                              'again',
                                                                              'against',
                                                          

In [13]:
# Mean cross-validated score of the best candidate in `grid_3`.
grid_3.best_score_

0.7281850676587518