# Project 3: Reddit API Classification & Natural Language Processing

## Tom Ludlow, DSI-NY-6

Using NLP to identify posts from **r/audioengineering** and **r/livesound**

## Notebook 4: Model Optimization

In [1]:
# library imports
import requests
import time
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm

# preprocessing imports
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# modeling imports
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# random state var
r = 1220

In [3]:
X_train = pd.read_csv('./csv/181220_X_train.csv', index_col=0)
X_test = pd.read_csv('./csv/181220_X_test.csv', index_col=0)
y_train = pd.read_csv('./csv/181220_y_train.csv', index_col=0)
y_test = pd.read_csv('./csv/181220_y_test.csv', index_col=0)

X_train_post = X_train['post_lm']
X_test_post = X_test['post_lm']

In [4]:
X_train.head()

Unnamed: 0,post_st,post_lm
575,look for book hi i'm look for book or document...,looking for book hi i'm looking for book or do...
584,where to appli i recent finish a record and pr...,where to apply i recently finished a recording...
1187,monitor tip gener work with a pro2 hey guy i k...,monitor tip generally work with a pro2 hey guy...
1569,what' the dumbest thing you'v ever heard on a ...,what's the dumbest thing you've ever heard on ...
1135,mix classic for the first time next week what ...,mixing classical for the first time next week ...


### Model Selections: 
#### 1. Lemmatized CountVectorizer Multinomial Naive-Bayes
  - `cv__ngram_range=(1,2)`
  - `cv__stop_words='english'`
  
#### 2. Lemmatized CountVectorizer Random Forest
*(project requirement)*
  - `cv__ngram_range=(1,1)`
  - `cv__stop_words='english'`
  
#### 3. Lemmatized CountVectorizer Gradient-Boost Decision Tree
  - `cv__ngram_range=(1,2)`
  - `cv__stop_words='english'`
  
#### 4. Lemmatized TF-IDF Scaled Logistic Regression
  - `cv__ngram_range=(1,2)`
  - `cv__stop_words='english'`

## Iterative Approach

For each model, we have set up a `runs` DataFrame to store the parameters and results of each GridSearch.  The GridSearch is set to a `random_state` value, so that cross validation selection will be consistent between runs, and we will be able to make direct comparisons over effectiveness of hyperparameters.

We start with a wide range for fields of interest, and narrow around the optimally selected value and gauge the degree of accuracy increase (or decrease).  Through trial and error, we are able to select hyperparameters that will promote the most accurate modeling results.

#### Instantiate DataFrame to store results of each run

In [5]:
mnb_runs = pd.DataFrame(columns=['train_accuracy','test_accuracy','bp','tn','fp','fn','tp'])

## Multinomial Naive-Bayes

In [145]:
mnb_runs.head()

Unnamed: 0,train_accuracy,test_accuracy,bp,tn,fp,fn,tp
0,0.992992,0.829832,"{'cv__max_features': None, 'mnb__alpha': 1.5}",196,35,46,199
1,0.987386,0.834034,"{'cv__max_features': 35000, 'mnb__alpha': 1.2}",193,38,41,204


In [125]:
r = 1220 # random_state variable for consistency

In [146]:
# parameters for GridSearch using Pipeline, formatted to call named estimators
mnb_params = {"mnb__alpha":np.arange(1,1.5,.1), "cv__max_features":[32500,35000,37500]}

# steps defining pipeline sequence and fixed parameters for GridSearch
mnb_steps = [('cv',CountVectorizer(stop_words='english', ngram_range=(1,2))),
            ('mnb',MultinomialNB())]

In [147]:
# establish model pipeline by reference to steps list
pipe = Pipeline(mnb_steps)

In [148]:
mnb_post_results = {} # empty dict to store results

grid = GridSearchCV(pipe, mnb_params, cv=5) # optimize GridSearch hyperparameters on `cv=5` cross validation runs
grid.fit(X_train_post, y_train.is_ls) # fit to our training data

print('Train Accuracy: ',grid.score(X_train_post, y_train.is_ls))
mnb_post_results['train_accuracy'] = grid.score(X_train_post, y_train.is_ls) # print/store training accuracy

print('Test Accuracy: ',grid.score(X_test_post, y_test.is_ls))
mnb_post_results['test_accuracy'] = grid.score(X_test_post, y_test.is_ls) # print/store test accuracy

print('BP: ',grid.best_params_)
mnb_post_results['bp'] = grid.best_params_ # print/store best parameters

tn, fp, fn, tp = confusion_matrix(y_test, grid.predict(X_test_post)).ravel() # inspect counted results in matrix
print("True Negatives: %s" % tn)
mnb_post_results['tn'] = tn
print("False Positives: %s" % fp)
mnb_post_results['fp'] = fp
print("False Negatives: %s" % fn)
mnb_post_results['fn'] = fn
print("True Positives: %s" % tp)
mnb_post_results['tp'] = tp

Train Accuracy:  0.9880868955851436
Test Accuracy:  0.8298319327731093
BP:  {'cv__max_features': 35000, 'mnb__alpha': 1.1}
True Negatives: 192
False Positives: 39
False Negatives: 42
True Positives: 203


#### Result Metrics: Multinomial Naive-Bayes

In [149]:
# accuracy
(mnb_post_results['tn'] + mnb_post_results['tp']) \
    / (mnb_post_results['tn'] + mnb_post_results['fp'] \
       + mnb_post_results['fn'] + mnb_post_results['tp'])

0.8298319327731093

In [150]:
# sensitivity
mnb_post_results['tp'] / (mnb_post_results['tp'] + mnb_post_results['fn'])

0.8285714285714286

In [151]:
# specificity
mnb_post_results['tn'] / (mnb_post_results['tn'] + mnb_post_results['fp'])

0.8311688311688312

In [152]:
# precision
mnb_post_results['tp'] / (mnb_post_results['tp'] + mnb_post_results['fp'])

0.8388429752066116

#### Save to run DataFrame

In [153]:
mnb_runs = mnb_runs.append(mnb_post_results, ignore_index=True)

In [154]:
mnb_runs.head()

Unnamed: 0,train_accuracy,test_accuracy,bp,tn,fp,fn,tp
0,0.992992,0.829832,"{'cv__max_features': None, 'mnb__alpha': 1.5}",196,35,46,199
1,0.987386,0.834034,"{'cv__max_features': 35000, 'mnb__alpha': 1.2}",193,38,41,204
2,0.988087,0.829832,"{'cv__max_features': 35000, 'mnb__alpha': 1.1}",192,39,42,203


If we are not yet satisfied with our accuracy and confusion matrix results, we return to the `mnb_params` ranges and modify to narrow or broaden one or more hyperparameters.  Any we are satisfied with can be set to our single desired hyperparameter, which will decrease our GridSearch execution time.

## Random Forest

In [155]:
rf_runs = pd.DataFrame(columns=['train_accuracy','test_accuracy','bp','tn','fp','fn','tp'])

#### Iterate from here to retain stored run data

In [187]:
rf_runs.head()

Unnamed: 0,train_accuracy,test_accuracy,bp,tn,fp,fn,tp
0,0.844429,0.77521,"{'cv__max_features': None, 'rf__criterion': 'g...",137,94,13,232
1,0.858444,0.77521,"{'cv__max_features': None, 'rf__criterion': 'g...",139,92,15,230
2,0.866854,0.796218,"{'cv__max_features': None, 'rf__criterion': 'g...",146,85,12,233


In [178]:
rf_params = {"rf__n_estimators":np.arange(98,102,1), "rf__max_depth": [7,8,9], 
             "rf__criterion":['gini',], "cv__max_features":[None,35000,40000,45000]}
rf_steps = [('cv',CountVectorizer(stop_words='english', ngram_range=(1,1))),
             ('rf',RandomForestClassifier(random_state=r))]

In [179]:
pipe = Pipeline(rf_steps)

In [180]:
rf_post_results = {}

grid = GridSearchCV(pipe, rf_params, cv=5)
grid.fit(X_train_post, y_train.is_ls)

print('Train Accuracy: ',grid.score(X_train_post, y_train.is_ls))
rf_post_results['train_accuracy'] = grid.score(X_train_post, y_train.is_ls)

print('Test Accuracy: ',grid.score(X_test_post, y_test.is_ls))
rf_post_results['test_accuracy'] = grid.score(X_test_post, y_test.is_ls)

print('BP: ',grid.best_params_)
rf_post_results['bp'] = grid.best_params_

tn, fp, fn, tp = confusion_matrix(y_test, grid.predict(X_test_post)).ravel()
print("True Negatives: %s" % tn)
rf_post_results['tn'] = tn
print("False Positives: %s" % fp)
rf_post_results['fp'] = fp
print("False Negatives: %s" % fn)
rf_post_results['fn'] = fn
print("True Positives: %s" % tp)
rf_post_results['tp'] = tp

Train Accuracy:  0.866853538892782
Test Accuracy:  0.7962184873949579
BP:  {'cv__max_features': None, 'rf__criterion': 'gini', 'rf__max_depth': 9, 'rf__n_estimators': 99}
True Negatives: 146
False Positives: 85
False Negatives: 12
True Positives: 233


#### Result Metrics: Random Forest

In [181]:
# accuracy
(rf_post_results['tn'] + rf_post_results['tp']) \
    / (rf_post_results['tn'] + rf_post_results['fp'] \
       + rf_post_results['fn'] + rf_post_results['tp'])

0.7962184873949579

In [182]:
# sensitivity
rf_post_results['tp'] / (rf_post_results['tp'] + rf_post_results['fn'])

0.9510204081632653

In [183]:
# specificity
rf_post_results['tn'] / (rf_post_results['tn'] + rf_post_results['fp'])

0.6320346320346321

In [184]:
# precision
rf_post_results['tp'] / (rf_post_results['tp'] + rf_post_results['fp'])

0.7327044025157232

#### Save run to DataFrame

In [185]:
rf_runs = rf_runs.append(rf_post_results, ignore_index=True)

In [186]:
rf_runs.head()

Unnamed: 0,train_accuracy,test_accuracy,bp,tn,fp,fn,tp
0,0.844429,0.77521,"{'cv__max_features': None, 'rf__criterion': 'g...",137,94,13,232
1,0.858444,0.77521,"{'cv__max_features': None, 'rf__criterion': 'g...",139,92,15,230
2,0.866854,0.796218,"{'cv__max_features': None, 'rf__criterion': 'g...",146,85,12,233


## Gradient-Boost Decision Tree

In [188]:
gb_runs = pd.DataFrame(columns=['train_accuracy','test_accuracy','bp','tn','fp','fn','tp'])

#### Iterate from here to retain stored run data

In [211]:
gb_runs.head()

Unnamed: 0,train_accuracy,test_accuracy,bp,tn,fp,fn,tp
0,0.998598,0.80042,"{'cv__max_features': None, 'gb__loss': 'devian...",168,63,32,213
1,0.998598,0.80042,"{'cv__max_features': None, 'gb__loss': 'devian...",168,63,32,213


In [219]:
gb_params = {"gb__loss":['deviance'], "gb__n_estimators": np.arange(97,103,1), 
             "gb__max_depth":np.arange(5,9,1), "cv__max_features":[None]}
gb_steps = [('cv',CountVectorizer(stop_words='english', ngram_range=(1,2))),
             ('gb',GradientBoostingClassifier(random_state=r))]

In [220]:
pipe = Pipeline(gb_steps)

In [221]:
gb_post_results = {}

grid = GridSearchCV(pipe, gb_params, cv=3) # set cross validation count to 3 to reduce processing time
grid.fit(X_train_post, y_train.is_ls)

print('Train Accuracy: ',grid.score(X_train_post, y_train.is_ls))
gb_post_results['train_accuracy'] = grid.score(X_train_post, y_train.is_ls)

print('Test Accuracy: ',grid.score(X_test_post, y_test.is_ls))
gb_post_results['test_accuracy'] = grid.score(X_test_post, y_test.is_ls)

print('BP: ',grid.best_params_)
gb_post_results['bp'] = grid.best_params_

tn, fp, fn, tp = confusion_matrix(y_test, grid.predict(X_test_post)).ravel()
print("True Negatives: %s" % tn)
gb_post_results['tn'] = tn
print("False Positives: %s" % fp)
gb_post_results['fp'] = fp
print("False Negatives: %s" % fn)
gb_post_results['fn'] = fn
print("True Positives: %s" % tp)
gb_post_results['tp'] = tp

Train Accuracy:  0.9985984583041345
Test Accuracy:  0.8004201680672269
BP:  {'cv__max_features': None, 'gb__loss': 'deviance', 'gb__max_depth': 7, 'gb__n_estimators': 100}
True Negatives: 168
False Positives: 63
False Negatives: 32
True Positives: 213


#### Result Metrics: Gradient-Boost Decision Tree

In [222]:
# accuracy
(gb_post_results['tn'] + gb_post_results['tp']) \
    / (gb_post_results['tn'] + gb_post_results['fp'] \
       + gb_post_results['fn'] + gb_post_results['tp'])

0.8004201680672269

In [223]:
# sensitivity
gb_post_results['tp'] / (gb_post_results['tp'] + gb_post_results['fn'])

0.8693877551020408

In [224]:
# specificity
gb_post_results['tn'] / (gb_post_results['tn'] + gb_post_results['fp'])

0.7272727272727273

In [225]:
# precision
gb_post_results['tp'] / (gb_post_results['tp'] + gb_post_results['fp'])

0.7717391304347826

#### Save run to DataFrame

In [226]:
gb_runs = gb_runs.append(gb_post_results, ignore_index=True)

In [227]:
gb_runs.head()

Unnamed: 0,train_accuracy,test_accuracy,bp,tn,fp,fn,tp
0,0.998598,0.80042,"{'cv__max_features': None, 'gb__loss': 'devian...",168,63,32,213
1,0.998598,0.80042,"{'cv__max_features': None, 'gb__loss': 'devian...",168,63,32,213
2,0.998598,0.80042,"{'cv__max_features': None, 'gb__loss': 'devian...",168,63,32,213
3,0.998598,0.80042,"{'cv__max_features': None, 'gb__loss': 'devian...",168,63,32,213


## TF-IDF Logistic Regression

In [6]:
lr_runs = pd.DataFrame(columns=['train_accuracy','test_accuracy','bp','tn','fp','fn','tp'])

#### Iterate from here to retain stored run data

In [7]:
lr_params = {"lr__penalty":['l1'], "lr__C": [1.2],
             "lr__tol":[.00035], "tf__max_features":[25000,27500,30000,32500,35000]}
lr_steps = [('tf',TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
            ('sc',StandardScaler(with_mean=False)),
            ('lr',LogisticRegression(random_state=r))]

In [8]:
pipe = Pipeline(lr_steps)

In [9]:
lr_post_results = {}

grid = GridSearchCV(pipe, lr_params, cv=3)
grid.fit(X_train_post, y_train.is_ls)

print('Train Accuracy: ',grid.score(X_train_post, y_train.is_ls))
lr_post_results['train_accuracy'] = grid.score(X_train_post, y_train.is_ls)

print('Test Accuracy: ',grid.score(X_test_post, y_test.is_ls))
lr_post_results['test_accuracy'] = grid.score(X_test_post, y_test.is_ls)

print('BP: ',grid.best_params_)
lr_post_results['bp'] = grid.best_params_

tn, fp, fn, tp = confusion_matrix(y_test, grid.predict(X_test_post)).ravel()
print("True Negatives: %s" % tn)
lr_post_results['tn'] = tn
print("False Positives: %s" % fp)
lr_post_results['fp'] = fp
print("False Negatives: %s" % fn)
lr_post_results['fn'] = fn
print("True Positives: %s" % tp)
lr_post_results['tp'] = tp

Train Accuracy:  0.999299229152
Test Accuracy:  0.806722689076
BP:  {'lr__C': 1.2, 'lr__penalty': 'l1', 'lr__tol': 0.00035, 'tf__max_features': 32500}
True Negatives: 193
False Positives: 38
False Negatives: 54
True Positives: 191


#### Result Metrics: TF-IDF Logistic Regression

In [10]:
# accuracy
(lr_post_results['tn'] + lr_post_results['tp']) \
    / (lr_post_results['tn'] + lr_post_results['fp'] \
       + lr_post_results['fn'] + lr_post_results['tp'])

0.80672268907563027

In [11]:
# sensitivity
lr_post_results['tp'] / (lr_post_results['tp'] + lr_post_results['fn'])

0.7795918367346939

In [12]:
# specificity
lr_post_results['tn'] / (lr_post_results['tn'] + lr_post_results['fp'])

0.83549783549783552

In [13]:
# precision
lr_post_results['tp'] / (lr_post_results['tp'] + lr_post_results['fp'])

0.83406113537117899

#### Save run to DataFrame

In [75]:
lr_runs = lr_runs.append(lr_post_results, ignore_index=True)

In [76]:
lr_runs.head()

Unnamed: 0,train_accuracy,test_accuracy,bp,tn,fp,fn,tp
0,0.999299,0.821429,"{'lr__C': 1, 'lr__penalty': 'l1', 'lr__tol': 0...",195,36,49,196
1,0.999299,0.804622,"{'lr__C': 1, 'lr__penalty': 'l1', 'lr__tol': 0...",190,41,52,193
2,0.999299,0.789916,"{'lr__C': 1.1, 'lr__penalty': 'l1', 'lr__tol':...",186,45,55,190
3,0.999299,0.789916,"{'lr__C': 1.1, 'lr__penalty': 'l1', 'lr__tol':...",186,45,55,190
4,0.999299,0.804622,"{'lr__C': 1.2000000000000002, 'lr__penalty': '...",192,39,54,191


In [78]:
lr_runs.loc[0]['bp']

{'lr__C': 1, 'lr__penalty': 'l1', 'lr__tol': 0.001, 'tf__max_features': 30000}

## Optimized Model Features
 
**Model 1:** Multinomial Naive-Bayes
 - *Lemmatizer*
 - *CountVectorizer*
  - `stop_words='english'`
  - `ngram_range=(1,2)`
 - *GridSearch*
  - `cv__max_features=35000`
  - `mnb__alpha=1.2`
 
**Model 2:** Random Forest
 - *Lemmatizer*
 - *CountVectorizer*
  - `stop_words='english'`
  - `ngram_range=(1,1)`
 - *GridSearch*
  - `cv__max_features=None`
  - `rf__criterion='gini'`
  - `rf__n_estimators=99`
  - `rf__max_depth=9`
  - `rf__max_features='sqrt'`
  
**Model 3:** Gradient-Boost Decision Tree
 - *Lemmatizer*
 - *CountVectorizer*
  - `stop_words='english'`
  - `ngram_range=(1,2)`
 - *GridSearch*
  - `cv__max_features=None`
  - `gb__loss='deviance'`
  - `gb__max_depth=7`
  - `gb__n_estimators=100`
  
**Model 4:** TF-IDF Logistic Regression
 - *Lemmatizer*
 - *TfidfVectorizer*
  - `stop_words='english'`
  - `ngram_range=(1,2)`
 - *GridSearch*
  - `tf__max_features=30000`
  - `lr__penalty='l1'`
  - `lr__C=1`
  - `lr__tol=.001`

With our model hyperparameters optimized through GridSearch, we build each desired model pipeline.

# Model 1 Optimized: Multinomial Naive-Bayes

In [228]:
m1_steps = [('m1_cv',CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=35000)),
           ('m1_mnb',MultinomialNB(alpha=1.2))]

In [229]:
pipe_1 = Pipeline(m1_steps)

In [230]:
pipe_1.fit(X_train.post_lm, y_train.is_ls)

Pipeline(memory=None,
     steps=[('m1_cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=35000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('m1_mnb', MultinomialNB(alpha=1.2, class_prior=None, fit_prior=True))])

In [231]:
pipe_1.score(X_train.post_lm, y_train.is_ls)

0.9873861247372109

In [232]:
pipe_1.score(X_test.post_lm, y_test.is_ls)

0.8340336134453782

In [233]:
tn, fp, fn, tp = confusion_matrix(y_test.is_ls, pipe_1.predict(X_test.post_lm)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))

True Negatives: 193
False Positives: 38
False Negatives: 41
True Positives: 204

Accuracy:  0.8340336134453782
Sensitivity:  0.8326530612244898
Specificity:  0.8354978354978355
Precision:  0.8429752066115702


# Model 2 Optimized: Random Forest

In [234]:
m2_steps = [('m2_cv',CountVectorizer(stop_words='english', ngram_range=(1,1))),
           ('m2_rf',RandomForestClassifier(criterion='gini', n_estimators=99, max_depth=9))]

In [235]:
pipe_2 = Pipeline(m2_steps)

In [236]:
pipe_2.fit(X_train.post_lm, y_train.is_ls)

Pipeline(memory=None,
     steps=[('m2_cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
       ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [237]:
pipe_2.score(X_train.post_lm, y_train.is_ls)

0.8703573931324456

In [238]:
pipe_2.score(X_test.post_lm, y_test.is_ls)

0.7689075630252101

In [239]:
tn, fp, fn, tp = confusion_matrix(y_test.is_ls, pipe_2.predict(X_test.post_lm)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))

True Negatives: 140
False Positives: 91
False Negatives: 19
True Positives: 226

Accuracy:  0.7689075630252101
Sensitivity:  0.9224489795918367
Specificity:  0.6060606060606061
Precision:  0.7129337539432177


# Model 3 Optimized: Gradient-Boost Decision Tree

In [240]:
m3_steps = [('m3_cv',CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=None)),
           ('m3_gb',GradientBoostingClassifier(loss='deviance', n_estimators=100, max_depth=7))]

In [241]:
pipe_3 = Pipeline(m3_steps)

In [242]:
pipe_3.fit(X_train.post_lm, y_train.is_ls)

Pipeline(memory=None,
     steps=[('m3_cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
       ...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

In [243]:
pipe_3.score(X_train.post_lm, y_train.is_ls)

0.9985984583041345

In [244]:
pipe_3.score(X_test.post_lm, y_test.is_ls)

0.7878151260504201

In [245]:
tn, fp, fn, tp = confusion_matrix(y_test.is_ls, pipe_3.predict(X_test.post_lm)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))

True Negatives: 163
False Positives: 68
False Negatives: 33
True Positives: 212

Accuracy:  0.7878151260504201
Sensitivity:  0.8653061224489796
Specificity:  0.7056277056277056
Precision:  0.7571428571428571


# Model 4 Optimized: TF-IDF Logistic Regression

In [14]:
m4_steps = [('tf',TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=30000)),
            ('sc',StandardScaler(with_mean=False)),
            ('lr',LogisticRegression(penalty='l1', C=1, tol=.001))]

In [15]:
pipe_4 = Pipeline(m4_steps)

In [16]:
pipe_4.fit(X_train.post_lm, y_train.is_ls)

Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ... penalty='l1', random_state=None, solver='warn',
          tol=0.001, verbose=0, warm_start=False))])

In [17]:
pipe_4.score(X_train.post_lm, y_train.is_ls)

0.99929922915206726

In [18]:
pipe_4.score(X_test.post_lm, y_test.is_ls)

0.80042016806722693

In [19]:
tn, fp, fn, tp = confusion_matrix(y_test.is_ls, pipe_4.predict(X_test.post_lm)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))

True Negatives: 190
False Positives: 41
False Negatives: 54
True Positives: 191

Accuracy:  0.800420168067
Sensitivity:  0.779591836735
Specificity:  0.822510822511
Precision:  0.823275862069


## Continue to Notebook 5: Model Evaluation