# Imports and loads

In [2]:
import pickle
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [3]:
def save_model(model,name):
    file_ext= '.sav'
    path = 'models/'
    pickle.dump(model, open(path+name+file_ext, 'wb'))
    
def predict_one(string, model_name, vectorizor_name):
    
    path = 'models/'
        
    tfid = pickle.load(open(path+vectorizor_name, 'rb'))
    tfidfed = tfid.transform([string])

    model = pickle.load(open(path+model_name, 'rb'))
    return model.predict(tfidfed)

def predict_many(review_list, model_name, vectorizor_name):
    path = 'models/'
        
    tfid = pickle.load(open(path+vectorizor_name, 'rb'))
    tfidfed = tfid.transform(review_list)

    model = pickle.load(open(path+model_name, 'rb'))
    return model.predict(tfidfed)

###### Load things stored from other pipes

In [4]:
arr  = np.load('data/english_arr.npy', allow_pickle = True)

In [26]:
ngr  = np.load('data/2grams.npy', allow_pickle = True)

# Vectorize and Stem, or: Pre-handling

In [14]:
lang = arr[:,3]
ratings = arr[:,0]


In [24]:
# good vs bad rating 4,5 will be good (1), -1, 1,2,3 will be bad (0)

y = ratings
gvb = []

for i in y:
    if i <=4:
        gvb.append(0)
    else:
        gvb.append(1)
        

np.save('gvbtrain.npy', gvb)

In [25]:
len(gvb)

39644

## Sub-Selections
gvb is a training set that will let the model predict good/bad valences, if a rating is 4 or 5, gvb = 1

FourFive is a sub-selection of the whole dataset with just the 4 and five ratings, to try and train a this is best model


In [32]:
tfid = TfidfVectorizer(stop_words ='english', lowercase = False, max_features = 5000)
tfidfed = tfid.fit_transform(lang)

X = tfidfed
y = ratings.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, gvb, test_size=0.33)

In [27]:
sngr = [' '.join(item) for item in ngr]
    

In [28]:
ngramTFV = TfidfVectorizer(lowercase = False, max_features = 5000)
ngramX = ngramTFV.fit_transform(sngr)




In [18]:
gvb  = np.load('gvbtrain.npy', allow_pickle = True)

X_train, X_test, y_train, y_test = train_test_split(ngramX, gvb, test_size=0.33)

# Various Models

In [19]:

knn_class = KNeighborsClassifier(n_neighbors = 18, n_jobs =-1)
knn_class.fit(X_train, y_train)
knn_class.score(X_test, y_test)

0.6639914392723382

#### An early random forest!

In [36]:
forest1 = RandomForestClassifier(min_samples_split = 10, oob_score=True, n_jobs =-1, n_estimators = 100)
forest1.fit(X, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [37]:
forest1.oob_score_

0.8460296640096862

In [None]:
save_model(tfid, 'tf84')

Random forest with more trees, mowed down.

In [33]:
manysaplings = RandomForestClassifier(min_samples_split = 10, oob_score=True, n_jobs =-1, n_estimators = 1000)
manysaplings.fit(X, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [34]:
manysaplings.oob_score_

0.8533699929371406

Random forest on Ngrams

In [29]:
ngramforest = RandomForestClassifier(min_samples_split = 10, oob_score=True, n_jobs =-1, n_estimators = 1000)
ngramforest.fit(ngramX, gvb)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [30]:
ngramforest.oob_score_

0.8543537483604077

### Gradient Boosted Regressors

In [64]:
len(gvb)

25754

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, gvb, test_size=0.33)

grad_boost = GradientBoostingRegressor()
grad_boost.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [39]:
report_score(grad_boost, X_test, y_test)

Training score: 0.2316075709727793, Testing score: 0.18810983656180602


NameError: name 'confusion_matrix' is not defined

In [36]:
def report_score(model, X_test, y_test):
    '''
    a function that reports the accuracy of the model.
    Attributes:
    models (lst): a list of instansiated models to test
    Returns:
    out array, model name, training score, testing score, precision, recall
    '''
    
    
    

    
    
    training_score = model.score(X_train, y_train)
    testing_score = model.score(X_test, y_test)
    print('Training score: {}, Testing score: {}'.format(training_score, testing_score))
    tn, fp, fn, tp = confusion_matrix(y_test,model.predict(X_test)).ravel()
    precision = tp/(fp+tp)
    recall = tp/(fn+tp)
    print('tn', '  fp', '  fn', '  tp')
    print(tn, fp, fn, tp)
    print('precision: '+str(precision), 'recall: '+ str(recall))
    out_lst = [model,training_score, testing_score, precision, recall]
    return out_lst

# Make Predictions!

- 'rf84.sav' is an 84% accurate random forest model. 
- 'tf84.sav' is the vectorizer formulated to work for the 84% rf, with 5000 rows. Can fit into other things with 500 features. 

In [None]:
string = '''I got the most expensive one. It seemed like the dream vacuum. I am not rich, but i like to invest in something that I really like. This vacuum seemed so perfect, but it just seemed like it. 2 weeks after use, the tube broke, and i freaked out. Amazon didn’t have any direct contact for warranty, it just gave me the number for dyson customer service. I called, and the costumer service was rough at the beginning asking unnecessary questions. And asking why did I purchase it from Amazon and not directly from Dyson. I said, that Dyson were the ones selling it from Amazon, and she said NO, and that Amazon gets it from them, and then they sells them. As if, they wete lower quality or something. After that, I almost felt obligated to ask for the manufacturer warranty. The whole conversation was awkward and finally she said okay. And registered the product, and helped me get a replacement which took about a week to get to me. Also, i don’t like the way the way the thing to release the dust opens. It gets the whole thing dirty and it’s a fail in my opinion. If I could go back in time, i wouldnt purchase it.'''

In [55]:
predict_one(string, 'rf84.sav', 'tf84.sav')

array([0])