# Imports and loads

In [2]:
import pickle
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, SCORERS
from sklearn.model_selection import train_test_split, GridSearchCV
from tools import save_model, predict_one, predict_many, report_score

In [2]:
def save_model(model, name):
    '''
    Pickle an object to models/<name>.sav.

    Attributes:
    model: any picklable object (this notebook passes fitted estimators and
        vectorizers)
    name (str): file stem; the '.sav' extension is appended here
    Returns:
    None
    '''
    import os  # local import: `os` is not among the notebook's top-level imports

    file_ext = '.sav'
    path = 'models/'
    # Create the target folder on first use instead of failing inside open().
    os.makedirs(path, exist_ok=True)
    # The context manager closes the handle, so the pickle is flushed to disk
    # before the function returns.
    with open(path + name + file_ext, 'wb') as handle:
        pickle.dump(model, handle)
    
def predict_one(string, model_name, vectorizor_name):
    '''
    Predict the label of a single text with a pickled vectorizer/model pair.

    Attributes:
    string (str): the text to classify
    model_name (str): file name of the pickled model inside models/
    vectorizor_name (str): file name of the pickled vectorizer inside models/
    Returns:
    whatever model.predict returns for the one transformed document
    '''
    path = 'models/'

    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # artifacts that this project wrote itself.
    with open(path + vectorizor_name, 'rb') as handle:
        tfid = pickle.load(handle)
    # transform() is given a one-element list holding the input text.
    tfidfed = tfid.transform([string])

    with open(path + model_name, 'rb') as handle:
        model = pickle.load(handle)
    return model.predict(tfidfed)

def predict_many(review_list, model_name, vectorizor_name):
    '''
    Predict labels for a collection of texts with a pickled vectorizer/model pair.

    Attributes:
    review_list: iterable of texts, handed to the vectorizer's transform() as is
    model_name (str): file name of the pickled model inside models/
    vectorizor_name (str): file name of the pickled vectorizer inside models/
    Returns:
    whatever model.predict returns for the transformed documents
    '''
    path = 'models/'

    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # artifacts that this project wrote itself.
    with open(path + vectorizor_name, 'rb') as handle:
        tfid = pickle.load(handle)
    tfidfed = tfid.transform(review_list)

    with open(path + model_name, 'rb') as handle:
        model = pickle.load(handle)
    return model.predict(tfidfed)

###### Load things stored from other pipes

In [21]:
arr  = np.load('data/english_arr.npy', allow_pickle = True)

In [3]:
ngr  = np.load('data/2grams.npy', allow_pickle = True)

# Vectorize and Stem, or: Pre-handling

In [22]:
# Column 0 is thresholded as the rating and column 3 is fed to the text
# vectorizer in the cells below.
ratings, lang = arr[:, 0], arr[:, 3]


In [23]:
# Binarise the ratings: 4 and 5 count as "good" (1); everything below 4
# (-1, 1, 2, 3) counts as "bad" (0).

y = ratings
gvb = [0 if rating < 4 else 1 for rating in y]

# Persist the labels; a later cell reloads them from this file.
np.save('gvbtrain.npy', gvb)

In [17]:
# Small hand-written sample for sanity-checking the good/bad threshold.
test_y = [1,2,3,4,5,5,4,3,2]
gvbt = [int(rating >= 4) for rating in test_y]

In [18]:
gvbt

[0, 0, 0, 1, 1, 1, 1, 0, 0]

## Sub-Selections
`gvb` is the binary training target used to predict good/bad valence: `gvb = 1` when the rating is 4 or 5, and `gvb = 0` otherwise.

FourFive is a sub-selection of the whole dataset containing only the 4 and 5 ratings, used to try to train a "this is the best" model.


In [9]:
# TF-IDF over the review text: English stop words removed, original casing
# kept, vocabulary capped at 5,000 features.
tfid = TfidfVectorizer(stop_words ='english', lowercase = False, max_features = 5000)
tfidfed = tfid.fit_transform(lang)

X = tfidfed
# NOTE(review): this column-vector `y` is not what the split below uses; the
# split is made against the binary `gvb` labels.
y = ratings.reshape(-1,1)

# Hold out 33% of the rows. No random_state is passed, so the split is not
# pinned from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, gvb, test_size=0.33)

In [4]:
# Join every entry of `ngr` into one space-separated string. This line used to
# be commented out, which left `sngr` undefined on a fresh kernel. It reads
# `ngr` and writes a new name, so re-running the cell gives the same result.
sngr = [' '.join(item) for item in ngr]

# 5,000-feature TF-IDF space over the joined n-grams; casing is left untouched.
ngramTFV5k = TfidfVectorizer(lowercase = False, max_features = 5000)
ngramX = ngramTFV5k.fit_transform(sngr)

In [5]:
# NOTE(review): this stores the fitted vectorizer through np.save, whereas the
# other artifacts in this notebook go through save_model() into models/*.sav,
# which is the folder predict_one/predict_many read from. Confirm whether this
# vectorizer should be saved the same way.
np.save('ngramTF5kneg', ngramTFV5k)


In [6]:
gvb  = np.load('gvbtrain.npy', allow_pickle = True)

#X_train, X_test, y_train, y_test = train_test_split(ngramX, gvb, test_size=0.33)

# Various Models

In [15]:

# 18-neighbour KNN baseline, using every available core (n_jobs=-1).
knn_class = KNeighborsClassifier(
    n_neighbors=18,
    n_jobs=-1,
)
knn_class.fit(X_train, y_train)
# Accuracy on the held-out split.
knn_class.score(X_test, y_test)

0.6763739203546587

#### An early random forest!

In [36]:
# 100-tree forest fitted on the full matrix; oob_score=True makes the
# out-of-bag estimate available as forest1.oob_score_.
forest1 = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=10,
    oob_score=True,
    n_jobs=-1,
)
forest1.fit(X, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [37]:
forest1.oob_score_

0.8460296640096862

In [None]:
save_model(tfid, 'tf84')

Random forest with more trees, mowed down.

In [33]:
# Same settings as forest1 but with ten times as many trees (1000).
manysaplings = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=10,
    oob_score=True,
    n_jobs=-1,
)
manysaplings.fit(X, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [34]:
manysaplings.oob_score_

0.8533699929371406

## Random forest on Ngrams

In [24]:

# 1000-tree forest fitted on the n-gram TF-IDF matrix instead of the
# plain-text one.
ngramforest_forest = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=10,
    oob_score=True,
    n_jobs=-1,
)
ngramforest_forest.fit(ngramX, gvb)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [28]:
save_model(ngramforest_forest, 'ngramRF85')

In [26]:
ngramforest_forest.oob_score_


0.8509736656240541

Training score: 0.9978916456458717, Testing score: 0.8330658105939005

- tn       fp   fn   tp
- 8072 280 1904 2827

precision: 0.9098809140650145 recall: 0.5975480870851828

In [41]:
# NOTE(review): `ngramforest_fortts` is not assigned in any cell visible here
# (the forest fitted above is `ngramforest_forest`) — confirm which model this
# call was meant to score.
report_score(ngramforest_fortts, X_test, y_test)

Training score: 0.9978916456458717, Testing score: 0.8330658105939005
tn   fp   fn   tp
8072 280 1904 2827
precision: 0.9098809140650145 recall: 0.5975480870851828


[0.9978916456458717,
 0.8330658105939005,
 0.9098809140650145,
 0.5975480870851828]

### Random Forest with vaders appended to vectorized space

In [11]:
from scipy.sparse import hstack

In [10]:
vaders_arr = np.load('data/training_vaders.npy', allow_pickle = True)


In [13]:
# Wider n-gram space (10,000 features). Note that `ngramX` is rebound here, so
# cells below that use `ngramX` see this 10k matrix.
ngramTFV10k = TfidfVectorizer(
    lowercase=False,
    max_features=10000,
)
ngramX = ngramTFV10k.fit_transform(sngr)

# Prepend the VADER columns to the TF-IDF columns, side by side.
vadersX = hstack((vaders_arr, ngramX))

In [42]:
# 1000-tree forest fitted on the stacked VADER + n-gram feature matrix.
vaders_forest = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=10,
    oob_score=True,
    n_jobs=-1,
)
vaders_forest.fit(vadersX, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [15]:
gvb  = np.load('gvbtrain.npy', allow_pickle = True)

X_train, X_test, y_train, y_test = train_test_split(vadersX, gvb, test_size=0.33)
vaders_forest.oob_score_

NameError: name 'vaders_forest' is not defined

In [5]:
def report_score(model, X_train, X_test, y_train, y_test):
    '''
    Print and return accuracy-style scores, precision and recall for one
    fitted binary classifier.

    Attributes:
    model: fitted estimator exposing .score(X, y) and .predict(X)
    X_train, y_train: data handed to model.score for the training score
    X_test, y_test: data used for the testing score and the confusion matrix
    Returns:
    list: [training score, testing score, precision, recall]
    '''
    training_score = model.score(X_train, y_train)
    testing_score = model.score(X_test, y_test)
    print('Training score: {}, Testing score: {}'.format(training_score, testing_score))

    # The four cells are unpacked in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()

    # Guard both denominators: a model that never predicts the positive class
    # (or a test set without positives) would otherwise divide by zero.
    predicted_positive = fp + tp
    actual_positive = fn + tp
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0

    print('tn', '  fp', '  fn', '  tp')
    print(tn, fp, fn, tp)
    print('precision: '+str(precision), 'recall: '+ str(recall))
    out_lst = [training_score, testing_score, precision, recall]
    return out_lst

In [16]:
# Same forest settings as above, but fitted on the training split only so
# that report_score can evaluate it on held-out rows.
vaders_forest_tts = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=10,
    oob_score=True,
    n_jobs=-1,
)
vaders_forest_tts.fit(X_train, y_train)
report_score(vaders_forest_tts, X_train, X_test, y_train, y_test)

Training score: 0.9983057866797184, Testing score: 0.8350531223725445
tn   fp   fn   tp
8197 263 1895 2728
precision: 0.912069541959211 recall: 0.590093013194895


[0.9983057866797184, 0.8350531223725445, 0.912069541959211, 0.590093013194895]

### Gradient Boosted Classifiers

In [64]:
len(gvb)

25754

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, gvb, test_size=0.33)

grad_boost = GradientBoostingClassifier()
grad_boost.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [37]:
# report_score as defined in this notebook takes both splits
# (model, X_train, X_test, y_train, y_test); the old 3-argument call raises
# TypeError against that definition.
report_score(grad_boost, X_train, X_test, y_train, y_test)

NameError: name 'grad_boost' is not defined

##### Gradient boosting with the n-grams

In [55]:
X_train, X_test, y_train, y_test = train_test_split(ngramX, gvb, test_size=0.33)

# `max_features = ` with no value was a SyntaxError. None matches the
# max_features=None shown in the recorded repr of the fitted model.
ngram_gboost = GradientBoostingClassifier(max_features=None)
ngram_gboost.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [56]:
# report_score as defined in this notebook takes both splits
# (model, X_train, X_test, y_train, y_test); the old 3-argument call raises
# TypeError against that definition.
report_score(ngram_gboost, X_train, X_test, y_train, y_test)

Training score: 0.7490681826738451, Testing score: 0.7307956890621418
tn   fp   fn   tp
8040 410 3112 1521
precision: 0.787674779906784 recall: 0.3282969997841571


[0.7490681826738451, 0.7307956890621418, 0.787674779906784, 0.3282969997841571]

### Gridsearch on ngrams

In [16]:
# NOTE(review): the grid search in this cell is entirely commented out, yet
# later cells read `gbr_gridsearch.best_estimator_` and call
# dir(gbr_gridsearch). On a fresh kernel those cells raise NameError unless
# this search is re-enabled (ideally behind a cache, since it is expensive).
# ngram_grid = {'max_depth': [10, 20, None],
#                       'max_features': [None],
#                       #'min_samples_split': [],
#                       'min_samples_leaf': [50,200],
#                       'n_estimators':[200, 250, 300]}                        
                           



# gbr_gridsearch = GridSearchCV(GradientBoostingClassifier(),
#                              ngram_grid,
#                              n_jobs=-1,
#                              verbose=True,
#                              scoring='f1')
# gbr_gridsearch.fit(X_train, y_train)

In [39]:
gbr_gridsearch.best_estimator_
#gbr_gridsearch.best_score_
#gbr_gridsearch.score

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=10,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=50, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [14]:
dir(gbr_gridsearch)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score',
 'scorer_',
 'scoring',
 '

## For fun

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
# report_score as defined in this notebook takes both splits
# (model, X_train, X_test, y_train, y_test); the old 3-argument call raises
# TypeError against that definition.
report_score(logreg, X_train, X_test, y_train, y_test)

Training score: 0.8228605850683333, Testing score: 0.7827715355805244
tn   fp   fn   tp
7514 838 2004 2727
precision: 0.7649368863955119 recall: 0.5764109067850349


[0.8228605850683333,
 0.7827715355805244,
 0.7649368863955119,
 0.5764109067850349]

Hey... that's... pretty good. Let's investigate what logreg thinks is important.

In [48]:
logreg_coef =logreg.coef_
dir(logreg)

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_predict_proba_lr',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'random_state',
 'score',
 'set_params',
 'solver',
 'sparsify',
 'tol',
 'verbose',
 'warm_start']

In [60]:
logreg._get_param_names()

['C',
 'class_weight',
 'dual',
 'fit_intercept',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_jobs',
 'penalty',
 'random_state',
 'solver',
 'tol',
 'verbose',
 'warm_start']

# Make Predictions!

- 'rf84.sav' is an 84% accurate random forest model. 
- 'tf84.sav' is the vectorizer built to work with the 84% random forest, with 5000 features. It can also feed other models that expect 5000 features.
- 'ngramRF85.sav' is the random forest trained on the n-grams, with an out-of-bag score of about 85%.

In [18]:
string = "i found this book to be atrocious"

# NOTE(review): 'ngramRF85.sav' holds the forest fitted on `ngramX` (the n-gram
# TF-IDF space), while 'tf84.sav' is the `tfid` vectorizer fitted on `lang`.
# Both are capped at 5000 features, so the call runs, but the columns
# presumably refer to different vocabularies — confirm this pairing.
predict_one(string, 'ngramRF85.sav', 'tf84.sav')

array([1])

In [26]:
predict_one(string, 'rf84.sav', 'tf84.sav')

array([1])

In [None]:
predict_many