# FORGE

___

Here we are going to do initial model testingon two selected models. K-Nearest Neighbors and Support Vector Machines.

We will fit and tune hyperparameters for those models in this notebook.

# Imports

___

In [1]:
import pickle
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.svm import LinearSVC

from gensim.models.word2vec import Word2Vec
import gensim



from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

___

## Read in corpus.pkl

In [2]:
with open('corpus.pkl','rb') as clean_pickle:
    corpus = pickle.load(clean_pickle)
    

In [3]:
with open("glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}

In [4]:
def prep_corpus(corpus,column):
    sparse_df = corpus[column]
    
    my_stop_words = ['https','com','www','people','know','actually',
                     'world','time','years','fact','facts','fake','like',
                     'sk','10','en','day','water','did','just']
    
    stop_words = text.ENGLISH_STOP_WORDS.union(my_stop_words)
   
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    cvec = CountVectorizer(stop_words=stop_words)
    sparse_df = cvec.fit_transform(sparse_df)

    
    sparse_df = pd.DataFrame(sparse_df.todense(), columns = cvec.get_feature_names())
    sparse_df =  sparse_df.loc[(sparse_df.sum(axis=1) > 5), (sparse_df.sum(axis=0) >5)]
    return sparse_df

In [5]:
sparse_df = prep_corpus(corpus,'selftext')

HUD function for viewing summary

In [6]:
# diplays preview of dataframe for checking changes
def disp_hud(hud):
    base_group = corpus.groupby(['fact']).mean()
    head = corpus.head(2)
    
    hud = [base_group,head]
    disp = ['mean','preview']
    
    for i,li in enumerate(hud):
        print(disp[i])
        display(li)
        
#disp_hud(hud)

In [7]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


## Train Test Split

___

In [8]:
def _make_tokens(corpus,column):    
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    corpus[column] = [tokenizer.tokenize(row) for row in corpus[column]]
    return corpus

corpus = _make_tokens(corpus,'selftext')

In [9]:
#corpus['selftext'] = [word_tokenize(word) for word in corpus.selftext]

In [10]:
corpus.selftext

0            [The, origin, word, "yeet", old, english, .]
1       [Did, LEGO, named, phrase, “let’s, ”?, In, 187...
2       [Taking, crust, pizza, considered, disrespectf...
3       [The, Catholic, church, secret, bible, allows,...
4                                     [The, earth, round]
                              ...                        
5519    [Some, species, fish, naturally, change, sex, ...
5520    [Believe, Not, :, Things, need, Justo, Smoker,...
5521    [Heart, Attacks, -, How, avoid, live, healthie...
5522    [The, Indonesian, Psychiatrists, Association, ...
5523    [The, analog, computer, Antikythera, mechanism...
Name: selftext, Length: 5524, dtype: object

In [11]:
X = corpus['selftext']
y = corpus.subreddit

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   stratify = y ,
                                                   test_size = .33,
                                                   random_state = 42)

In [13]:
model = gensim.models.Word2Vec(X)
w2v = dict(zip(model.wv.index_to_key, model.wv))

In [14]:
# reading glove files, this may take a while
# we're reading line by line and only saving vectors
# that correspond to words from our training set
# if you wan't to play around with the vectors and have 
# enough RAM - remove the 'if' line and load everything

import struct 

GLOVE_6B_300D_PATH = "glove.6B.300d.txt"
GLOVE_6B_50D_PATH = "glove.6B.50d.txt"
encoding="utf-8"


glove_small = {}
all_words = set(w for words in X for w in words)
with open(GLOVE_6B_50D_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums=np.array(parts[1:], dtype=np.float32)
            glove_small[word] = nums

            
glove_big = {}
with open(GLOVE_6B_300D_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if word in all_words:
            nums=np.array(parts[1:], dtype=np.float32)
            glove_big[word] = nums

# GO 1

In [15]:
svc_1 = Pipeline([
    ('word2vec', MeanEmbeddingVectorizer(w2v)),
    ('support_vectors',LinearSVC(max_iter = 5_000, verbose = 1))
]);

nb_1 = Pipeline([
    ('word2vec', MeanEmbeddingVectorizer(w2v)),
    ('naive',BernoulliNB())
]);

In [16]:
svc_1.fit(X_train,y_train)
nb_1.fit(X_train,y_train)

[LibLinear]

Pipeline(steps=[('word2vec',
                 <__main__.MeanEmbeddingVectorizer object at 0x000001E0C2EC2640>),
                ('naive', BernoulliNB())])

In [17]:
train = cross_val_score(nb_1,X_train,y_train)
test = cross_val_score(nb_1,X_test,y_test)

print(f'''NB Train:{train.mean()} 
NB Test:{test.mean()}''')

train = cross_val_score(svc_1,X_train,y_train)
test = cross_val_score(svc_1,X_test,y_test)

print(f'''svc Train:{train.mean()} 
svc Test:{test.mean()}''')

NB Train:0.5190487653645548 
NB Test:0.5118064127653168
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]svc Train:0.6195583032425137 
svc Test:0.592426614481409


# Go 2

In [18]:
svc_1 = Pipeline([
    ('word2vec', MeanEmbeddingVectorizer(w2v)),
    ('support_vectors',LinearSVC(max_iter = 5_000))
]);

nb_1 = Pipeline([
    ('word2vec', MeanEmbeddingVectorizer(w2v)),
    ('naive',BernoulliNB())
]);

In [19]:
train = cross_val_score(nb_1,X_train,y_train)
test = cross_val_score(nb_1,X_test,y_test)

print(f'''NB Train:{train.mean()} 
NB Test:{test.mean()}''')

train = cross_val_score(svc_1,X_train,y_train)
test = cross_val_score(svc_1,X_test,y_test)

print(f'''svc Train:{train.mean()} 
svc Test:{test.mean()}''')

NB Train:0.5190487653645548 
NB Test:0.5118064127653168
svc Train:0.6195583032425137 
svc Test:0.592426614481409


## Pipelines

In [20]:
from sklearn.svm import LinearSVC

In [21]:
pipe_svm_cv = Pipeline([
    ('cvec', CountVectorizer()),
    ('scv',LinearSVC()),
]);

pipe_svm_tf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('scv',LinearSVC()),
]);



In [22]:
svc = SVC()
cvec = CountVectorizer()
mnb = MultinomialNB()

NameError: name 'SVC' is not defined