# 0. Imports

In [7]:
# Main go-to imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import nltk

# Extra From Imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import stop_words
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

%matplotlib inline
pd.set_option('max_colwidth', 99)

# 1. Grab Data!

In [2]:
df = pd.read_csv('datasets/model.csv')

In [3]:
df.head()

Unnamed: 0,created_utc,title,selftext,subreddit_asoiaf,title_and_text
0,1579203996,[Spoilers Main] Wildfire looks like. . .,I noticed today that the description of wildfire sounds oddly familiar to many brands of antifr...,1,[Spoilers Main] Wildfire looks like. . . I noticed today that the description of wildfire sound...
1,1579205857,(Spoilers Extended) The unusual coloring of Tywin's Valyrian steel swords,"On a re-read of Storm of Swords, I was struck by the passage where Tywin shows Tyrion the two n...",1,(Spoilers Extended) The unusual coloring of Tywin's Valyrian steel swords On a re-read of Storm...
2,1579206490,Fancasts for Brienne of Tarth?,[removed],1,Fancasts for Brienne of Tarth? [removed]
3,1579206863,What if Viserys Targaryen returned to Westeros under a peace banner and swore fealty to Robert ...,[removed],1,What if Viserys Targaryen returned to Westeros under a peace banner and swore fealty to Robert ...
4,1579208246,(Spoilers Extended) What if Viserys Targaryen returned to Westeros under a peace banner and swo...,"In this scenario, Viserys is not mad or cruel and this is before he has encountered the Dothrak...",1,(Spoilers Extended) What if Viserys Targaryen returned to Westeros under a peace banner and swo...


# 2. Models Pt I

In [4]:
X = df['title_and_text']
y = df['subreddit_asoiaf']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.3, stratify = y)

In [5]:
y_train.value_counts(normalize=True)

1    0.506302
0    0.493698
Name: subreddit_asoiaf, dtype: float64

In [6]:
y_test.value_counts(normalize=True)

1    0.506442
0    0.493558
Name: subreddit_asoiaf, dtype: float64

#### 2a. TFIDVectorizer Transform / Logistic Regression Classification

In [7]:
pipe = Pipeline(
    [
        ('tvec', TfidfVectorizer()),
        ('lr', LogisticRegression())
    ]
)

pipe_params = {
    'tvec__ngram_range': [(1,1), (1,2), (1,3), (2,2)],
    'tvec__stop_words': [None, 'english'],
    'lr__penalty':['l1', 'l2'],
    'lr__C': [10, 1000, 1e9]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')

In [8]:
results = gs.fit(X_train, y_train)









In [9]:
results.best_params_

{'lr__C': 1000000000.0,
 'lr__penalty': 'l2',
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': 'english'}

In [10]:
results.best_estimator_

Pipeline(memory=None,
         steps=[('tvec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1000000000.0, class_weight=None,
                                    dual=False, fit_intercept=True,
     

In [11]:
# Get score
train_score = results.best_score_
print('Best TRAIN accuracy: {:.4f}'.format(train_score))
test_score = results.score(X_test, y_test)
print('Best TEST set accuracy: {:.4f}'.format(test_score))

Best TRAIN accuracy: 0.9894
Best TEST set accuracy: 0.9874


#### 2b. CountVectorizer Transform / Logistic Regression Classification

In [12]:
pipe2 = Pipeline(
    [
        ('cvec', CountVectorizer()),
        ('lr', LogisticRegression(solver='liblinear'))
    ]
)

pipe_params2 = {
    'cvec__ngram_range': [(1,1), (1,2), (1,3), (2,2)],
    'cvec__stop_words': [None, 'english'],
    'lr__penalty':['l1', 'l2'],
    'lr__C': [10, 1000, 1e9]
}

gs2 = GridSearchCV(pipe2, param_grid=pipe_params2, cv=3, scoring = 'accuracy')

In [13]:
results2 = gs2.fit(X_train, y_train)



In [14]:
results2.best_params_

{'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'lr__C': 10,
 'lr__penalty': 'l2'}

In [15]:
results2.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('lr',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                       

In [16]:
# Get score
train_score2 = results2.best_score_
print('Best TRAIN accuracy: {:.4f}'.format(train_score2))
test_score2 = results2.score(X_test, y_test)
print('Best TEST set accuracy: {:.4f}'.format(test_score2))

Best TRAIN accuracy: 0.9790
Best TEST set accuracy: 0.9815


#### 2c. CountVectorizer Transform / $k$-NN Classification

In [17]:
pipe3 = Pipeline(
    [
        ('cvec', CountVectorizer()),
        ('ss', StandardScaler(with_mean=False)),
        ('knn', KNeighborsClassifier())
    ]
)

pipe_params3 = {
    'cvec__ngram_range': [(1,1), (1,2), (1,3), (2,2)],
    'cvec__stop_words': [None, 'english'],    
    'knn__n_neighbors': [3, 5, 7, 9, 15],
    'knn__leaf_size': [15, 30, 45, 100]
}

gs3 = GridSearchCV(pipe3, param_grid=pipe_params3, cv=3, scoring = 'accuracy')

In [18]:
results3 = gs3.fit(X_train, y_train)

In [19]:
results3.best_params_

{'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'knn__leaf_size': 15,
 'knn__n_neighbors': 3}

In [20]:
results3.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('ss',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=15,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neigh

In [21]:
# Get score
train_score3 = results3.best_score_
print('Best TRAIN accuracy: {:.4f}'.format(train_score3))
test_score3 = results3.score(X_test, y_test)
print('Best TEST set accuracy: {:.4f}'.format(test_score3))

Best TRAIN accuracy: 0.5981
Best TEST set accuracy: 0.6211


#### 2d. TFIDF Transform / $k$-NN Classification

In [22]:
pipe4 = Pipeline(
    [
        ('tvec', TfidfVectorizer()),
        ('ss', StandardScaler(with_mean=False)),
        ('knn', KNeighborsClassifier())
    ]
)

pipe_params4 = {
    'tvec__ngram_range': [(1,1), (1,2), (1,3), (2,2)],
    'tvec__stop_words': [None, 'english'],   
    'knn__n_neighbors': [3, 5, 7, 9, 15],
    'knn__leaf_size': [15, 30, 45, 100]
}

gs4 = GridSearchCV(pipe4, param_grid=pipe_params4, cv=3, scoring = 'accuracy')

In [23]:
results4 = gs4.fit(X_train, y_train)

In [24]:
results4.best_params_

{'knn__leaf_size': 15,
 'knn__n_neighbors': 3,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': None}

In [25]:
results4.best_estimator_

Pipeline(memory=None,
         steps=[('tvec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('ss',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm=

In [27]:
# Get score
train_score4 = results4.best_score_
print('Best TRAIN accuracy: {:.4f}'.format(train_score4))
test_score4 = results4.score(X_test, y_test)
print('Best TEST set accuracy: {:.4f}'.format(test_score4))

Best TRAIN accuracy: 0.5111
Best TEST set accuracy: 0.5137


In [32]:
y_preds = gs.predict(X_test)

In [33]:
y_preds;

In [35]:
tn, fp, fn, tp = confusion_matrix(y_test,
                                  y_preds).ravel()

print(confusion_matrix(y_test,
                       y_preds))

[[1475   19]
 [  19 1514]]


In [38]:
tn

1475

In [39]:
fp

19

In [37]:
cm = confusion_matrix(y_test, y_preds)
cm_df = pd.DataFrame(cm, columns=['Predicted WoT', 'Predicted asoiaf'], index=['Actual WoT', 'Actual Asoiaf'])
cm_df

Unnamed: 0,Predicted WoT,Predicted asoiaf
Actual WoT,1475,19
Actual Asoiaf,19,1514


In [40]:
sens = tp / (tp + fn)

print(f'Sensitivity: {round(sens, 4)}')

Sensitivity: 0.9876


In [8]:
spec = tn / (tn + fp)

print(f'Specificity: {round(spec, 4)}')

NameError: name 'tn' is not defined

#### 2e. CountVectorizer Transform / Multinomial Naive Classification

In [9]:
pipe5 = Pipeline(
    [
        ('cvec', CountVectorizer()),
        ('mnb', MultinomialNB())
    ]
)

pipe_params5 = {
    'cvec__ngram_range': [(1,1), (1,2), (1,3), (2,2)],
    'cvec__stop_words': [None, 'english'],
}

gs5 = GridSearchCV(pipe5, param_grid=pipe_params5, cv=3, scoring = 'accuracy')

In [10]:
results5 = gs5.fit(X_train, y_train)

In [11]:
# Get score
train_score5 = results5.best_score_
print('Best TRAIN accuracy: {:.4f}'.format(train_score5))
test_score5 = results5.score(X_test, y_test)
print('Best TEST set accuracy: {:.4f}'.format(test_score5))

Best TRAIN accuracy: 0.9786
Best TEST set accuracy: 0.9765
