In [21]:
# imports
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer

pd.options.display.max_colwidth = 200

In [2]:
df_train = pd.read_csv('../data/train_ready.csv')
df_test = pd.read_csv('../data/test_ready.csv')

In [3]:
df_train.drop('Unnamed: 0', axis=1, inplace=True)
df_test.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
# df_train.info(), df_test.info()

---

### MNB model

In [5]:
X =  df_train['Reviews']
y = df_train['Sentiment']

In [6]:
# split it, test it, fax, rename it ;)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [7]:
# MNB for the first one
pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB())
])

In [8]:
# put up some params
params = {
    'cvec__max_features': [60_000], # tested 10,000 to 100,000
    'cvec__min_df': [2], # tested 2 to 3
    'cvec__max_df': [.9], # tested but... 
    'cvec__ngram_range':[(1,4)] # tested (1,1) to (1,4)
}

In [9]:
# instantiate gridsearch
gs = GridSearchCV(pipe, param_grid=params, cv=3)

In [10]:
# fit gridsearch
gs.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words='english')),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__max_df': [0.9], 'cvec__max_features': [60000],
                         'cvec__min_df': [2], 'cvec__ngram_range': [(1, 4)]})

In [11]:
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 60000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 4)}

In [12]:
gs.score(X_train, y_train)

0.9266133333333333

In [13]:
gs.score(X_test, y_test)

0.87136

In [14]:
# Note: score is .869 after hypertuning with stop_words

In [15]:
X_t = df_test['Reviews']
y_t = df_test['Sentiment']

In [16]:
gs.fit(X_t, y_t)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words='english')),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__max_df': [0.9], 'cvec__max_features': [60000],
                         'cvec__min_df': [2], 'cvec__ngram_range': [(1, 4)]})

In [17]:
gs.score(X_t, y_t)

0.91724

### RFC model

In [47]:
pipe_rfc = Pipeline ([
    ('cvec', CountVectorizer(stop_words='english')),
    ('rfc', RandomForestClassifier(n_estimators=500))
])

In [42]:
# # put up some params
# params_rfc = {
#     'rfc__n_estimators': [500] # started testing at 100
# }

In [43]:
# gs_rfc = GridSearchCV(pipe_rfc, param_grid=params_rfc, cv=3)

In [44]:
# gs_rfc.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words='english')),
                                       ('rfc', RandomForestClassifier())]),
             param_grid={'rfc__n_estimators': [300, 400, 500]})

In [45]:
# gs_rfc.score(X_train, y_train)

1.0

In [46]:
# gs_rfc.best_params_

{'rfc__n_estimators': 500}

**pipe testing - default RFC**

In [48]:
pipe_rfc.fit(X_train, y_train)

Pipeline(steps=[('cvec', CountVectorizer(stop_words='english')),
                ('rfc', RandomForestClassifier(n_estimators=500))])

In [49]:
pipe_rfc.score(X_train, y_train)

1.0

In [50]:
pipe_rfc.score(X_test, y_test)

0.86512

In [51]:
pipe_rfc.score(X_t, y_t)

0.96628

---

ready_test.csv log:
- MNB test model = .898
- MNB "tuned" = .917
- RFC default n_estimators (100) = .962
- RFC "tuned" = 0.966