# Building a Logistic Regression & Gaussian & Multinomial Naive Bayes Models

In [55]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB , MultinomialNB
from sklearn import metrics

## Load in data and create features and target feaure variables


In [2]:
#read in the data 
data = pd.read_csv('./reddit_datasets/combined_data.csv', index_col = 'Unnamed: 0')

In [3]:
data.head() 

Unnamed: 0,selftext,title,timestamp,sub_reddit,t_s,sent_selftext,sent_title,sent_ts,hour
0,"We were at the park, kids were taking turns go...",My toddler chest-kicked another kid into the a...,2019-07-08 05:13:30,1,My toddler chest-kicked another kid into the a...,0.9861,0.3875,0.9861,5
1,I’m not a good looking guy and I don’t say it ...,The most female interaction I’ve ever had in m...,2019-07-08 08:23:10,1,The most female interaction I’ve ever had in m...,0.9939,0.0,0.9939,8
2,I know a gal who has two kids and is pregnant ...,I think people with genetic conditions that th...,2019-07-08 20:15:16,1,I think people with genetic conditions that th...,-0.9905,0.0,-0.9905,20
3,This is pretty hard for me to post. Ive never ...,I almost killed my step-mom..,2019-07-07 16:50:52,1,I almost killed my step-mom..This is pretty ha...,-0.9976,-0.6378,-0.9978,16
4,"After livelier-than-average sex, my wife said ...",My Wife Asked a Loaded Question and I Lied to Her,2019-07-08 21:46:59,1,My Wife Asked a Loaded Question and I Lied to ...,-0.847,-0.3818,-0.8741,21


In [4]:
#create features
X = data.drop( columns = 'sub_reddit')


In [5]:
X.shape

(1971, 8)

In [6]:
#create target feature
y = data['sub_reddit']

## Building the Logistic Regression Model

In [7]:
#Train test split your data for count vectorizer
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [9]:
#assign my data that will be fed into count_vec
X_vec = X_train['t_s']

In [10]:
#create pipeline
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

In [11]:
#set pipe params
pipe_params = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [None],
    'lr__penalty': ['l1','l2']
}
#Instatiate GridSearchCV
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_vec, y_train)
print(gs.best_score_)
#get best params
gs.best_params_



0.7016238159675237


{'cvec__max_df': 0.9,
 'cvec__max_features': 3000,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None,
 'lr__penalty': 'l2'}

In [12]:
#re-instatiate CountVectorizer with the best params
cvec = CountVectorizer(max_df = 0.9,
                       max_features = 3000,
                       min_df = 3,
                       ngram_range = (1, 2),
                       stop_words = None)

In [13]:
#fit model
cvec.fit(X_vec)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=3000, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
#transform data
final_word_vec = cvec.transform(X['t_s'])

In [15]:
#change vector into pandas DataFrame
X_vec_df = pd.DataFrame(final_word_vec.toarray(), columns=cvec.get_feature_names())
len(X_vec_df)

1971

In [16]:
#concatenate new vectorized data & previous df
new_data =  pd.concat([data, X_vec_df], axis=1)

In [17]:
new_data.shape

(1971, 3009)

In [18]:
data.head()

Unnamed: 0,selftext,title,timestamp,sub_reddit,t_s,sent_selftext,sent_title,sent_ts,hour
0,"We were at the park, kids were taking turns go...",My toddler chest-kicked another kid into the a...,2019-07-08 05:13:30,1,My toddler chest-kicked another kid into the a...,0.9861,0.3875,0.9861,5
1,I’m not a good looking guy and I don’t say it ...,The most female interaction I’ve ever had in m...,2019-07-08 08:23:10,1,The most female interaction I’ve ever had in m...,0.9939,0.0,0.9939,8
2,I know a gal who has two kids and is pregnant ...,I think people with genetic conditions that th...,2019-07-08 20:15:16,1,I think people with genetic conditions that th...,-0.9905,0.0,-0.9905,20
3,This is pretty hard for me to post. Ive never ...,I almost killed my step-mom..,2019-07-07 16:50:52,1,I almost killed my step-mom..This is pretty ha...,-0.9976,-0.6378,-0.9978,16
4,"After livelier-than-average sex, my wife said ...",My Wife Asked a Loaded Question and I Lied to Her,2019-07-08 21:46:59,1,My Wife Asked a Loaded Question and I Lied to ...,-0.847,-0.3818,-0.8741,21


In [19]:
#assigns features and target features
X = new_data.drop( columns = ['sub_reddit',
                              'selftext',
                              'title',
                              'timestamp',
                              't_s'])
y= new_data['sub_reddit']


In [20]:
#train test split new data for logistic reg
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

In [21]:
#instatiate model
logreg = LogisticRegression()

In [22]:
#fit model
logreg.fit(X_train, y_train)

#print out intercept and coefficients
print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')



Logistic Regression Intercept: [1.07218056]
Logistic Regression Coefficient: [[ 0.5852295   0.1579413  -0.27862373 ...  0.25263656  0.49679009
   0.11522927]]


In [24]:
#check if it generates predicted values
print(f'Logreg predicted values: {logreg.predict(X_train.head())}')

Logreg predicted values: [1 0 0 0 1]


In [25]:
#check if it generates predicted probabilities
print(f'Logreg predicted probabilities: {logreg.predict_proba(X_train.head())}')

Logreg predicted probabilities: [[4.88112952e-02 9.51188705e-01]
 [9.95557485e-01 4.44251535e-03]
 [8.11963378e-01 1.88036622e-01]
 [9.99610354e-01 3.89645834e-04]
 [6.01247027e-02 9.39875297e-01]]


In [26]:
y_pred =  logreg.predict(X_test)

In [27]:
#find cross val score mean
cross_val_score(logreg, X_train, y_train, cv = 5).mean()




0.6954545454545454

In [29]:
#check actual accuracy score
metrics.accuracy_score(y_test, y_pred)

0.6973886328725039

## Build a K Nearest Neighbors Classifier

In [30]:
#instantiate
knn = KNeighborsClassifier()

In [31]:
#check crossval score
cross_val_score(KNeighborsClassifier(n_neighbors = 5), X_train, y_train, cv = 5).mean()

0.6030303030303029

In [45]:
X_train.head()

Unnamed: 0,sent_selftext,sent_title,sent_ts,hour,10,10 years,100,11,12,13,...,you to,you ve,you want,you were,you will,you you,young,younger,your,yourself
287,-0.9726,-0.7717,-0.9753,9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1270,-0.8175,0.0,-0.8175,2,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1956,0.4242,-0.5413,-0.0727,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1345,-0.9695,-0.5106,-0.9695,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
736,0.7598,-0.4588,0.5463,11,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Build Gaussian and Multinomial Naive Bayes Models

In [47]:
#drop non discrete data
X_train.drop( columns = ['sent_selftext','sent_title','sent_ts'],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [52]:
#drop non discrete data
X_test.drop( columns = ['sent_selftext','sent_title','sent_ts'],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [None]:
#instantiate Gaussian model
gnb = GaussianNB()

In [None]:
#fit
gnb.fit(X_train , y_train)

In [53]:
y_pred =  gnb.predict(X_test)

In [54]:
metrics.accuracy_score(y_test, y_pred)

0.6390168970814132

In [56]:
#instiate Multinomial model
mnb = MultinomialNB()

In [57]:
mnb.fit(X_train , y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [58]:
y_pred =  mnb.predict(X_test)

In [59]:
metrics.accuracy_score(y_test, y_pred)

0.717357910906298