# Models 1 & 2 for predicting subreddit based on the post title

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

%matplotlib inline

In [2]:
# Read the post data (title, ups, num_comments, subreddit) that every model below is built on.
df = pd.read_csv(filepath_or_buffer='../datasets/concat_data.csv')

In [3]:
df.head()

Unnamed: 0,title,ups,num_comments,subreddit
0,[General Discussion] Around the Horn - 4/1/19,26,174,baseball
1,1919 r/baseball Power Rankings -- Preseason / ...,80,69,baseball
2,WAR is ruining the game of baseball,5106,258,baseball
3,Pete Alonso hits his first major league home r...,581,159,baseball
4,The Cubs have made 6 errors in 6 innings tonight,400,70,baseball


In [4]:
df.tail()

Unnamed: 0,title,ups,num_comments,subreddit
1730,Breaking Down the All-Time Greatest Milwaukee ...,2,0,marchmadness
1731,Every March Madness school ranked by their mos...,1,0,marchmadness
1732,"Norfolk St. and Lehigh beat Mizzou and Duke, r...",4,4,marchmadness
1733,Greg Gumbel photo bomb,0,0,marchmadness
1734,Fuck UConn,0,2,marchmadness


In [5]:
df.shape

(1735, 4)

In [6]:
df.isnull().sum()

title           0
ups             0
num_comments    0
subreddit       0
dtype: int64

In [7]:
df['subreddit'].value_counts()

baseball        924
marchmadness    811
Name: subreddit, dtype: int64

In [8]:
# Class shares rather than raw counts: the two subreddits are close to a 50/50 split.
df['subreddit'].value_counts(normalize=True)

baseball        0.532565
marchmadness    0.467435
Name: subreddit, dtype: float64

# Model 1: CountVectorizer and Logistic Regression

In [9]:
# Features are the raw post titles; the target is the subreddit each post came from.
X, y = df['title'], df['subreddit']

In [10]:
# Stratify on the target so train and test keep the same subreddit proportions;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)


# Running with default parameters and checking accuracy

In [11]:
# Baseline: bag-of-words counts fed into a logistic regression, all defaults.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression()),
])

In [12]:
# An empty grid has a single candidate (the pipeline as-is), so this is
# simply a 3-fold cross-validation of the default settings.
pipe_params = {}
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_



0.889315910837817


{}

In [13]:
gs.score(X_train, y_train)

0.9992313604919293

In [14]:
gs.score(X_test, y_test)

0.9078341013824884

In [15]:
preds = gs.predict(X_test)

In [16]:
# Rows are the actual subreddit, columns the predicted one.
confusion_matrix(y_true=y_test, y_pred=preds)

array([[206,  25],
       [ 15, 188]])

In [17]:
# Unpack the 2x2 matrix row by row: first row is (tn, fp), second is (fn, tp).
# Labels are sorted, so 'baseball' is the negative and 'marchmadness' the positive class.
(tn, fp), (fn, tp) = confusion_matrix(y_test, preds)

In [18]:
# Share of test posts assigned to the correct subreddit.
acc = (tn + tp) / (tn + fp + fn + tp)
print(f'Accuracy: {round(acc,4)}')

Accuracy: 0.9078


### The default model overfits (train accuracy ≈ 0.999 vs. test accuracy ≈ 0.908). Tuning hyperparameters to reduce the variance.

In [19]:
# Same count-vectorizer + logistic-regression pipeline, now with C set to 2
# (the vectorizer settings are supplied by the grid search that follows).
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(C=2)),
])

In [20]:
# Restrict the vocabulary to curb the overfit seen with default settings;
# only min_df is actually searched, the other entries are fixed single values.
pipe_params = dict(
    cvec__stop_words=['english'],
    # train score increases with higher max_features but causes high variance
    cvec__max_features=[200],
    cvec__min_df=[5, 10, 50],
    cvec__ngram_range=[(1, 1)],
)
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_



0.8616448885472713


{'cvec__max_features': 200,
 'cvec__min_df': 5,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [21]:
gs.score(X_train, y_train)

0.9146810146041506

In [22]:
gs.score(X_test, y_test)

0.9101382488479263

In [23]:
preds = gs.predict(X_test)

In [24]:
# Labelled confusion matrix for the tuned count-vectorizer model.
cm = confusion_matrix(y_true=y_test, y_pred=preds)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual Neg', 'Actual Pos'],
    columns=['Pred Neg', 'Pred Pos'],
)
cm_df

In [29]:
# Unpack the 2x2 matrix row by row: (tn, fp) then (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, preds)

In [30]:
# Accuracy of the tuned count-vectorizer model on the test set.
acc = (tn + tp) / (tn + fp + fn + tp)
print(f'Accuracy: {round(acc,4)}')

Accuracy: 0.9101


In [31]:
# Specificity (true-negative rate): tn out of all actual negatives.
spec = tn / (fp + tn)
print(f'Specificity: {round(spec,4)}')

Specificity: 0.8788


In [32]:
# Sensitivity (true-positive rate): tp out of all actual positives.
sens = tp / (fn + tp)
print(f'Sensitivity: {round(sens,4)}')

Sensitivity: 0.9458


# Compare CountVectorizer with TfidfVectorizer

In [33]:
# Swap the count vectorizer for TF-IDF weighting; logistic regression uses C=10 here.
pipe = Pipeline(steps=[
    ('tfid', TfidfVectorizer()),
    ('lr', LogisticRegression(C=10)),
])

In [34]:
# Every entry holds a single value, so the search evaluates one configuration
# and reports its 3-fold cross-validated score.
pipe_params = dict(
    tfid__stop_words=['english'],
    tfid__max_features=[300],
    tfid__ngram_range=[(1, 1)],
    tfid__min_df=[2],
)
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.883935434281322




{'tfid__max_features': 300,
 'tfid__min_df': 2,
 'tfid__ngram_range': (1, 1),
 'tfid__stop_words': 'english'}

In [35]:
gs.score(X_train, y_train)

0.9523443504996156

In [36]:
gs.score(X_test, y_test)

0.9124423963133641

In [37]:
preds = gs.predict(X_test)

In [38]:
# Labelled confusion matrix for the TF-IDF + logistic-regression model.
cm = confusion_matrix(y_true=y_test, y_pred=preds)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual Neg', 'Actual Pos'],
    columns=['Pred Neg', 'Pred Pos'],
)
cm_df

Unnamed: 0,Pred Neg,Pred Pos
Actual Neg,206,25
Actual Pos,13,190


In [39]:
# Raw confusion matrix: rows are actual classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=preds)

array([[206,  25],
       [ 13, 190]])

In [40]:
# Unpack the 2x2 matrix row by row: (tn, fp) then (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, preds)

In [41]:
# Accuracy of the TF-IDF + logistic-regression model on the test set.
acc = (tn + tp) / (tn + fp + fn + tp)
print(f'Accuracy: {round(acc,4)}')

Accuracy: 0.9124


# Model 2: TfidfVectorizer and MultinomialNB

In [42]:
# Model 2 uses the same inputs as Model 1: post titles as features, subreddit as target.
X, y = df['title'], df['subreddit']

In [43]:
# Same seed and stratification as the Model 1 split, so both models are
# evaluated on the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [44]:
# TF-IDF features into a multinomial naive Bayes classifier with light smoothing (alpha=.01).
pipe2 = Pipeline(steps=[
    ('tfid', TfidfVectorizer()),
    ('mb', MultinomialNB(alpha=.01)),
])

In [45]:
# Single fixed configuration for the vectorizer; the grid search is used
# only to obtain its 3-fold cross-validated score.
pipe2_params = dict(
    tfid__stop_words=['english'],
    tfid__max_features=[400],
    tfid__ngram_range=[(1, 1)],
    tfid__min_df=[2],
)
gs2 = GridSearchCV(estimator=pipe2, param_grid=pipe2_params, cv=3)
gs2.fit(X_train, y_train)
print(gs2.best_score_)
gs2.best_params_

0.846272098385857


{'tfid__max_features': 400,
 'tfid__min_df': 2,
 'tfid__ngram_range': (1, 1),
 'tfid__stop_words': 'english'}

In [46]:
gs2.score(X_train, y_train)

0.9039200614911607

In [47]:
gs2.score(X_test, y_test)

0.9055299539170507

In [48]:
preds2 = gs2.predict(X_test)

In [49]:
# Raw confusion matrix for Model 2: rows are actual classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=preds2)

array([[223,   8],
       [ 33, 170]])

In [50]:
# Unpack Model 2's 2x2 matrix row by row: (tn, fp) then (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, preds2)

In [53]:
# Labelled confusion matrix for the TF-IDF + naive Bayes model.
cm2 = confusion_matrix(y_true=y_test, y_pred=preds2)
cm_df2 = pd.DataFrame(
    data=cm2,
    index=['Actual Neg', 'Actual Pos'],
    columns=['Pred Neg', 'Pred Pos'],
)
cm_df2

Unnamed: 0,Pred Neg,Pred Pos
Actual Neg,223,8
Actual Pos,33,170


In [52]:
# Accuracy of the TF-IDF + naive Bayes model on the test set.
acc2 = (tn + tp) / (tn + fp + fn + tp)
print(f'Accuracy: {round(acc2,4)}')

Accuracy: 0.9055


In [54]:
# Specificity (true-negative rate) of Model 2: tn out of all actual negatives.
spec2 = tn / (fp + tn)
print(f'Specificity: {round(spec2,4)}')

Specificity: 0.9654


In [55]:
# Sensitivity (true-positive rate) of Model 2: tp out of all actual positives.
sens2 = tp / (fn + tp)
print(f'Sensitivity: {round(sens2,4)}')

Sensitivity: 0.8374


# Comparing CountVectorizer with TfidfVectorizer using MultinomialNB

In [56]:
# Same naive Bayes classifier as pipe2, but on raw term counts instead of TF-IDF weights.
pipe3 = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('mb', MultinomialNB(alpha=.01)),
])

In [57]:
# Only min_df is searched (2 vs. 3); the remaining vectorizer settings are fixed.
pipe3_params = dict(
    cvec__stop_words=['english'],
    cvec__max_features=[200],
    cvec__ngram_range=[(1, 1)],
    cvec__min_df=[2, 3],
)
gs3 = GridSearchCV(estimator=pipe3, param_grid=pipe3_params, cv=3)
gs3.fit(X_train, y_train)
print(gs3.best_score_)
gs3.best_params_

0.8316679477325134


{'cvec__max_features': 200,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [58]:
gs3.score(X_train, y_train)

0.8585703305149884

In [59]:
gs3.score(X_test, y_test)

0.8870967741935484

In [60]:
preds3 = gs3.predict(X_test)

In [61]:
# Raw confusion matrix for the count-vectorizer + naive Bayes model.
confusion_matrix(y_true=y_test, y_pred=preds3)

array([[221,  10],
       [ 39, 164]])

In [62]:
# Unpack the 2x2 matrix for preds3 row by row: (tn, fp) then (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, preds3)

In [63]:
# Labelled confusion matrix for the count-vectorizer + naive Bayes model.
cm3 = confusion_matrix(y_true=y_test, y_pred=preds3)
cm_df3 = pd.DataFrame(
    data=cm3,
    index=['Actual Neg', 'Actual Pos'],
    columns=['Pred Neg', 'Pred Pos'],
)
cm_df3

Unnamed: 0,Pred Neg,Pred Pos
Actual Neg,221,10
Actual Pos,39,164


In [64]:
# Accuracy of the count-vectorizer + naive Bayes model on the test set.
acc3 = (tn + tp) / (tn + fp + fn + tp)
print(f'Accuracy: {round(acc3,4)}')

Accuracy: 0.8871


### TfidfVectorizer performed better than CountVectorizer in Model 2

# Trying out decision trees

In [65]:
# Count-vectorized titles fed into a decision-tree classifier.
# random_state is fixed because DecisionTreeClassifier permutes the features
# randomly at each split, so an unseeded tree can produce different scores on
# every run. 42 matches the seed already used for train_test_split.
pipe4 = Pipeline([('cvec', CountVectorizer()),
                 ('dtr', DecisionTreeClassifier(random_state=42))
                ])

In [66]:
# Only min_df is searched (2 vs. 3); the tree itself keeps the settings given in pipe4.
pipe4_params = dict(
    cvec__stop_words=['english'],
    cvec__max_features=[200],
    cvec__ngram_range=[(1, 1)],
    cvec__min_df=[2, 3],
)
gs4 = GridSearchCV(estimator=pipe4, param_grid=pipe4_params, cv=3)
gs4.fit(X_train, y_train)
print(gs4.best_score_)
gs4.best_params_

0.8293620292083013


{'cvec__max_features': 200,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [67]:
gs4.score(X_train, y_train)

0.9431206764027671

In [68]:
gs4.score(X_test, y_test)

0.8847926267281107

In [69]:
preds4 = gs4.predict(X_test)

In [70]:
# Raw confusion matrix for the count-vectorizer + decision-tree model.
confusion_matrix(y_true=y_test, y_pred=preds4)

array([[196,  35],
       [ 15, 188]])

In [71]:
# Unpack the 2x2 matrix for preds4 row by row: (tn, fp) then (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, preds4)

In [72]:
# Accuracy of the count-vectorizer + decision-tree model on the test set.
acc4 = (tn + tp) / (tn + fp + fn + tp)
print(f'Accuracy: {round(acc4,4)}')

Accuracy: 0.8848


# Trying TfidfVectorizer with DecisionTrees

In [73]:
# TF-IDF-weighted titles fed into a decision-tree classifier.
# random_state is fixed because DecisionTreeClassifier permutes the features
# randomly at each split, so an unseeded tree can produce different scores on
# every run. 42 matches the seed already used for train_test_split.
pipe5 = Pipeline([('tfid', TfidfVectorizer()),
                 ('dtr', DecisionTreeClassifier(random_state=42))
                ])

In [74]:
# Single fixed vectorizer configuration; the grid search is used only to
# obtain its 3-fold cross-validated score.
pipe5_params = dict(
    tfid__stop_words=['english'],
    tfid__max_features=[400],
    tfid__ngram_range=[(1, 1)],
    tfid__min_df=[2],
)
gs5 = GridSearchCV(estimator=pipe5, param_grid=pipe5_params, cv=3)
gs5.fit(X_train, y_train)
print(gs5.best_score_)
gs5.best_params_

0.8255188316679477


{'tfid__max_features': 400,
 'tfid__min_df': 2,
 'tfid__ngram_range': (1, 1),
 'tfid__stop_words': 'english'}

In [75]:
gs5.score(X_train, y_train)

0.9754035357417371

In [76]:
gs5.score(X_test, y_test)

0.8732718894009217

In [77]:
preds5 = gs5.predict(X_test)

In [78]:
# Raw confusion matrix for the TF-IDF + decision-tree model.
confusion_matrix(y_true=y_test, y_pred=preds5)

array([[191,  40],
       [ 15, 188]])

In [79]:
# Unpack the 2x2 matrix for preds5 row by row: (tn, fp) then (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, preds5)

In [80]:
# Accuracy of the TF-IDF + decision-tree model (predictions in preds5).
# Stored as acc5 so the count-vectorizer tree's result in acc4 is not
# overwritten and the two can still be compared afterwards.
acc5 = (tp + tn) / (tp + fp + fn + tn)

print(f'Accuracy: {round(acc5,4)}')

Accuracy: 0.8733


### Both decision-tree pipelines perform worse than the other two models, and both overfit.