# Imports and loads

In [1]:
import pickle
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, SCORERS
from sklearn.model_selection import train_test_split, GridSearchCV


In [2]:
def save_model(model,name):
    file_ext= '.sav'
    path = 'models/'
    pickle.dump(model, open(path+name+file_ext, 'wb'))
    
def predict_one(string, model_name, vectorizor_name):
    
    path = 'models/'
        
    tfid = pickle.load(open(path+vectorizor_name, 'rb'))
    tfidfed = tfid.transform([string])

    model = pickle.load(open(path+model_name, 'rb'))
    return model.predict(tfidfed)

def predict_many(review_list, model_name, vectorizor_name):
    path = 'models/'
        
    tfid = pickle.load(open(path+vectorizor_name, 'rb'))
    tfidfed = tfid.transform(review_list)

    model = pickle.load(open(path+model_name, 'rb'))
    return model.predict(tfidfed)

###### Load things stored from other pipes

In [4]:
arr  = np.load('data/english_arr.npy', allow_pickle = True)

In [3]:
ngr  = np.load('data/2grams.npy', allow_pickle = True)

# Vectorize and Stem, or: Pre-handling

In [14]:
lang = arr[:,3]
ratings = arr[:,0]


In [24]:
# good vs bad rating 4,5 will be good (1), -1, 1,2,3 will be bad (0)

y = ratings
gvb = []

for i in y:
    if i <=4:
        gvb.append(0)
    else:
        gvb.append(1)
        

np.save('gvbtrain.npy', gvb)

In [25]:
len(gvb)

39644

## Sub-Selections
gvb is a training set that will let the model predict good/bad valences, if a rating is 4 or 5, gvb = 1

FourFive is a sub-selection of the whole dataset with just the 4 and five ratings, to try and train a this is best model


In [32]:
tfid = TfidfVectorizer(stop_words ='english', lowercase = False, max_features = 5000)
tfidfed = tfid.fit_transform(lang)

X = tfidfed
y = ratings.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, gvb, test_size=0.33)

In [6]:
sngr = [' '.join(item) for item in ngr]
    

In [7]:
ngramTFV = TfidfVectorizer(lowercase = False, max_features = 5000)
ngramX = ngramTFV.fit_transform(sngr)




In [8]:
gvb  = np.load('gvbtrain.npy', allow_pickle = True)

X_train, X_test, y_train, y_test = train_test_split(ngramX, gvb, test_size=0.33)

# Various Models

In [19]:

knn_class = KNeighborsClassifier(n_neighbors = 18, n_jobs =-1)
knn_class.fit(X_train, y_train)
knn_class.score(X_test, y_test)

0.6639914392723382

#### An early random forest!

In [36]:
forest1 = RandomForestClassifier(min_samples_split = 10, oob_score=True, n_jobs =-1, n_estimators = 100)
forest1.fit(X, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [37]:
forest1.oob_score_

0.8460296640096862

In [None]:
save_model(tfid, 'tf84')

Random forest with more trees, mowed down.

In [33]:
manysaplings = RandomForestClassifier(min_samples_split = 10, oob_score=True, n_jobs =-1, n_estimators = 1000)
manysaplings.fit(X, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [34]:
manysaplings.oob_score_

0.8533699929371406

Random forest on Ngrams

In [29]:
ngramforest = RandomForestClassifier(min_samples_split = 10, oob_score=True, n_jobs =-1, n_estimators = 1000)
ngramforest.fit(ngramX, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [30]:
ngramforest.oob_score_

0.8543537483604077

### Gradient Boosted Regressors

In [64]:
len(gvb)

25754

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, gvb, test_size=0.33)

grad_boost = GradientBoostingClassifier()
grad_boost.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [50]:
report_score(grad_boost, X_test, y_test)

Training score: 0.7440984902676857, Testing score: 0.718183902774593
tn   fp   fn   tp
8025 383 3304 1371
precision: 0.7816419612314709 recall: 0.2932620320855615


[GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=100,
                            n_iter_no_change=None, presort='auto',
                            random_state=None, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 0.7440984902676857,
 0.718183902774593,
 0.7816419612314709,
 0.2932620320855615]

##### Gradient boosting with the n-grams

In [55]:
X_train, X_test, y_train, y_test = train_test_split(ngramX, gvb, test_size=0.33)

ngram_gboost = GradientBoostingClassifier(max_features = )
ngram_gboost.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [56]:
report_score(ngram_gboost, X_test, y_test)

Training score: 0.7490681826738451, Testing score: 0.7307956890621418
tn   fp   fn   tp
8040 410 3112 1521
precision: 0.787674779906784 recall: 0.3282969997841571


[0.7490681826738451, 0.7307956890621418, 0.787674779906784, 0.3282969997841571]

### Gridsearch on ngrams

In [None]:
ngram_grid = {'max_depth': [1,2, 3, 4, 10],
                      'max_features': ['sqrt', 'log2', None],
                      'min_samples_split': [2, 4, 10],
                      'min_samples_leaf': [1, 2, 4],
                      'n_estimators':[10,50,100,200]}                        
                           



gbr_gridsearch = GridSearchCV(GradientBoostingClassifier(),
                             ngram_grid,
                             n_jobs=-1,
                             verbose=True,
                             scoring='f1')
gbr_gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 540 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 51.1min


In [53]:
def report_score(model, X_test, y_test):
    '''
    a function that reports the accuracy of the model.
    Attributes:
    models (lst): a list of instansiated models to test
    Returns:
    out array, model name, training score, testing score, precision, recall
    '''
    
    
    

    
    
    training_score = model.score(X_train, y_train)
    testing_score = model.score(X_test, y_test)
    print('Training score: {}, Testing score: {}'.format(training_score, testing_score))
    tn, fp, fn, tp = confusion_matrix(y_test,model.predict(X_test)).ravel()
    precision = tp/(fp+tp)
    recall = tp/(fn+tp)
    print('tn', '  fp', '  fn', '  tp')
    print(tn, fp, fn, tp)
    print('precision: '+str(precision), 'recall: '+ str(recall))
    out_lst = [training_score, testing_score, precision, recall]
    return out_lst

# Make Predictions!

- 'rf84.sav' is an 84% accurate random forest model. 
- 'tf84.sav' is the vectorizer formulated to work for the 84% rf, with 5000 rows. Can fit into other things with 500 features. 

In [None]:
string = "fuck this book"

In [None]:
predict_one(string, 'rf84.sav', 'tf84.sav')