# Project 5: Group Project
#### Author: Adam Pardo, Brandon Bergeron, Eric Bayless, Ramesh Babu

### 03 - ML modeling  

Task: Comparing different Maching Learning models on our sample data containing ~400 restaurants

Information: 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

import pickle

import warnings
warnings.filterwarnings('ignore')

## Functions

In [2]:
#--formatted printing for model scores

def print_scores(model):
    print(f'train score: {model.score(x_train, y_train)}')
    print(f'test score: {model.score(x_test, y_test)}')

## Reading in reviews and combining

In [3]:
# combines all reviews for each restaurant into one observation

#-read in reviews

df_reviews = pd.read_csv('../data/Las_Vegas_reviews.csv') #-- All reviews
#df_reviews = pd.read_csv('../data/Las_Vegas_400_reviews.csv') #-- Initial Sample


#--combine all reviews
df_revs_combined = df_reviews.groupby(['business_id', 'name', 
                               'address', 'city' ,
                               'state', 'postal_code', 
                               'latitude' ,'longitude' , 
                               'stars', 'review_count', 
                               'is_open', 'attributes', 'categories']).agg({'text': ' '.join})

#--reset index and add review length column for total 
df_revs_combined = df_revs_combined.reset_index()
df_revs_combined['review_wc'] = df_revs_combined['text'].str.split().str.len()

In [4]:
df_revs_combined.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,text,review_wc
0,-0RkJ_uIduNLWQrphbADRw,Rooster Boy Cafe,"2620 Regatta Dr, Ste 113",Las Vegas,NV,89128,36.207539,-115.268154,4.0,194,1,"{'WheelchairAccessible': 'True', 'RestaurantsP...","Coffee & Tea, Restaurants, Cafes, Food, Breakf...",Amazing food and service. So grateful for the ...,24200


## Baseline: 72% accuracy

In [5]:
df_revs_combined['is_open'].value_counts(normalize=True)

1    0.724548
0    0.275452
Name: is_open, dtype: float64

# Modeling

In [6]:
#-- TTS, stratifying for imbalanced y

x = df_revs_combined['text']
y = df_revs_combined['is_open']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=251)

In [7]:
#-- Preprocessors used in all models
cvect = CountVectorizer(stop_words='english', max_features=1000)
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
ss = StandardScaler(with_mean=False)

In [8]:
#-- Basic Logistic Regression to guage performance and get some feature imports for EDA

logr = LogisticRegression(n_jobs=-1, max_iter=1000)

pipe_logr = make_pipeline(cvect, ss, logr)
pipe_logr.fit(x_train, y_train)

print_scores(pipe_logr)

#-- Extracting features and imports
features = pipe_logr.named_steps.countvectorizer.get_feature_names()
coefs = pipe_logr.named_steps.logisticregression.coef_
coefs_df = pd.DataFrame({'importance': coefs[0]}, index = features)

neg_imports = coefs_df.sort_values('importance', ascending=False).tail(50)
pos_imports = coefs_df.sort_values('importance', ascending=False).head(50)

#-- Combine imports into one df
pos_neg_imports = pd.concat([pos_imports, neg_imports])

#with open('pos_neg_imports.data', 'wb') as fh:
    #pickle.dump(pos_neg_imports, fh)

train score: 1.0
test score: 0.7895927601809954


In [9]:
#-- instantiate basic versions of models to loop over

model_dict = {
    'LogisticRegression' : LogisticRegression(max_iter=1000, n_jobs=-1),
    'MultinomialNB' : MultinomialNB(),
    'RandomForest' : RandomForestClassifier(n_jobs=-1),
    'ExtraTrees' : ExtraTreesClassifier(n_jobs=-1),
    'K-NearestNeighbors' : KNeighborsClassifier(n_jobs=-1),
    'SVC' : SVC(),
    'AdaBoostClassifier' : AdaBoostClassifier(n_estimators=100),
    'GradientBoostingClassifier' : GradientBoostingClassifier()    
}

In [10]:
#--Loop over base models with both vectorizers and get train/test scores for each

df_models = {}

for key in model_dict.keys():  
    
    estimator = model_dict[key]
    
    #-- pipelines for both vectorizers
    pipe_cvect = make_pipeline(cvect, ss, estimator)
    pipe_tfidf = make_pipeline(tfidf, ss, estimator)

    #-- fit both 
    pipe_cvect.fit(x_train, y_train)
    pipe_tfidf.fit(x_train, y_train)
    
    
    #-- gets scores and adds to df_models
    
    df_models[f'{key}_1'] = {
        'preprocessing' : 'CountVectorizer',
        'train_score' : pipe_cvect.score(x_train, y_train),
        'test_score' : pipe_cvect.score(x_test, y_test)
    }
    
    df_models[f'{key}_2'] = {
        'preprocessing' : 'TfidfVectorizer',
        'train_score' : pipe_tfidf.score(x_train, y_train),
        'test_score' : pipe_tfidf.score(x_test, y_test)
    }
    
    
    #--Printing for progress
    #print(f'{key} with CountVectorizer:')
    #print_scores(pipe_cvect)
    #print()
    #print(f'{key} with TfidfVectorizer:')
    #print_scores(pipe_tfidf)
    #print('\n\n')

In [11]:
df_first_models = pd.DataFrame(df_models).T
df_first_models

Unnamed: 0,preprocessing,train_score,test_score
LogisticRegression_1,CountVectorizer,0.994721,0.780543
LogisticRegression_2,TfidfVectorizer,1.0,0.773756
MultinomialNB_1,CountVectorizer,0.728507,0.651584
MultinomialNB_2,TfidfVectorizer,0.711161,0.644796
RandomForest_1,CountVectorizer,0.319005,0.31448
RandomForest_2,TfidfVectorizer,1.0,0.766968
ExtraTrees_1,CountVectorizer,0.33635,0.309955
ExtraTrees_2,TfidfVectorizer,1.0,0.764706
K-NearestNeighbors_1,CountVectorizer,0.75641,0.723982
K-NearestNeighbors_2,TfidfVectorizer,0.800905,0.742081


In [12]:
#with open('../data/first_models.data', 'wb') as fh:
#    pickle.dump(df_first_models, fh)

# Add Tokenizer and Parameter Searching with GridSearchCV

Each model will be fit using the word vectorizer that perfomed best on initial test

In [13]:
from nltk.tokenize import RegexpTokenizer

In [14]:
#--Tokenizer for 

def split_lemmatize(text):
    'returns a lowercase lemmatized list of words'
    text_lower = text.lower()
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(x) for x in text_lower.split()])

tfidf_lem = TfidfVectorizer(preprocessor = split_lemmatize, stop_words = 'english', max_features=1000)
cvect_lem = CountVectorizer(preprocessor = split_lemmatize, stop_words = 'english', max_features=1000)

In [15]:
#--Preprocessing words for model speed

x_train_tf = tfidf_lem.fit_transform(x_train)
x_test_tf = tfidf_lem.transform(x_test)

x_train_cv = cvect_lem.fit_transform(x_train)
x_test_cv = cvect_lem.transform(x_test)

## Logistic Regression:

In [16]:
logreg = LogisticRegression(max_iter=1000, n_jobs=-1)

pipe_logreg = make_pipeline(ss, logreg)

params = {
    'logisticregression__C' : [0, 1, 10, 100],
    'logisticregression__penalty' : ['l1', 'l2', 'elasticnet', 'none'],    
}

grid_logreg = GridSearchCV(pipe_logreg, params, n_jobs=-1)
grid_logreg.fit(x_train_cv, y_train)


print('Logistic Regression with GridSearch')
print(f'train score: {grid_logreg.score(x_train_cv, y_train)}')
print(f'test score: {grid_logreg.score(x_test_cv, y_test)}')
print(f'best params: {grid_logreg.best_params_}')

Logistic Regression with GridSearch
train score: 1.0
test score: 0.7692307692307693
best params: {'logisticregression__C': 0, 'logisticregression__penalty': 'none'}


## RandomForest

In [17]:
forest = RandomForestClassifier(n_jobs=-1)

pipe_forest = make_pipeline(ss, forest)

params_forest = {
    'randomforestclassifier__min_samples_leaf' : [3, 5, 10, 20],
    'randomforestclassifier__n_estimators' : [100, 200, 300],
    'randomforestclassifier__max_depth' : [5, 10, 20, None]
}

grid_forest = GridSearchCV(pipe_forest, params_forest, n_jobs=-1)

grid_forest.fit(x_train_tf, y_train)


print('RandomForest with GridSearch:')
print(f'train score: {grid_forest.score(x_train_tf, y_train)}')
print(f'test score: {grid_forest.score(x_test_tf, y_test)}')
print(f'best params: {grid_forest.best_params_}')

RandomForest with GridSearch:
train score: 0.9984917043740573
test score: 0.7647058823529411
best params: {'randomforestclassifier__max_depth': 20, 'randomforestclassifier__min_samples_leaf': 3, 'randomforestclassifier__n_estimators': 100}


## KNN

In [18]:
knn = KNeighborsClassifier(n_jobs=-1)

pipe_knn = make_pipeline(ss, knn)

params_knn = {
    'kneighborsclassifier__n_neighbors' : [3, 5, 7, 9, 15],
    'kneighborsclassifier__p' : [1, 2]
}

grid_knn = GridSearchCV(pipe_knn, params_knn, n_jobs=-1)

grid_knn.fit(x_train_cv, y_train)


print('RandomForest with GridSearch:')
print(f'train score: {grid_knn.score(x_train_cv, y_train)}')
print(f'test score: {grid_knn.score(x_test_cv, y_test)}')
print(f'best params: {grid_knn.best_params_}')

RandomForest with GridSearch:
train score: 0.782051282051282
test score: 0.7330316742081447
best params: {'kneighborsclassifier__n_neighbors': 15, 'kneighborsclassifier__p': 1}


## SVC

In [19]:
svc = SVC()

pipe_svc = make_pipeline(ss, svc)

params_svc = {
    'svc__kernel' : ['rbf', 'sigmoid'],
    'svc__C' : [0, 1, 5, 10]
}

grid_svc = GridSearchCV(pipe_svc, params_svc, n_jobs=-1)
grid_svc.fit(x_train_tf, y_train)


print('SVC with GridSearch:')
print(f'train score: {grid_svc.score(x_train_tf, y_train)}')
print(f'test score: {grid_svc.score(x_test_tf, y_test)}')
print(f'best params: {grid_svc.best_params_}')

SVC with GridSearch:
train score: 1.0
test score: 0.8212669683257918
best params: {'svc__C': 5, 'svc__kernel': 'rbf'}


# Boosting

#### AdaBoost

In [20]:
adaboost = AdaBoostClassifier()

pipe_boost = make_pipeline(ss, adaboost)

params_boost = {
    'adaboostclassifier__n_estimators' : [50, 100, 200],
    'adaboostclassifier__learning_rate' : [1, 5, 10]
}

grid_boost = GridSearchCV(pipe_boost, params_boost, n_jobs=-1)
grid_boost.fit(x_train_tf, y_train)

print('AdaBoost with GridSearch')
print(f'train score: {grid_boost.score(x_train_tf, y_train)}')
print(f'test score: {grid_boost.score(x_test_tf, y_test)}')
print(f'best_params: {grid_boost.best_params_}')

AdaBoost with GridSearch
train score: 1.0
test score: 0.8009049773755657
best_params: {'adaboostclassifier__learning_rate': 1, 'adaboostclassifier__n_estimators': 200}


#### GradientBoost

In [21]:
gboost = GradientBoostingClassifier()

pipe_gboost = make_pipeline(ss, gboost)

params_gboost = {
    'gradientboostingclassifier__min_samples_leaf' : [3, 6, 10, 20],
    'gradientboostingclassifier__max_depth' : [3, 5, 9],
    'gradientboostingclassifier__ccp_alpha' : [1, 10, 100]

}

grid_gboost = GridSearchCV(pipe_gboost, params_gboost, n_jobs=-1)
grid_gboost.fit(x_train_tf, y_train)

print('GradBoost with GridSearch')
print(f'train score: {grid_gboost.score(x_train_tf, y_train)}')
print(f'test score: {grid_gboost.score(x_test_tf, y_test)}')
print(f'best_params: {grid_gboost.best_params_}')

GradBoost with GridSearch
train score: 0.72473604826546
test score: 0.7239819004524887
best_params: {'gradientboostingclassifier__ccp_alpha': 1, 'gradientboostingclassifier__max_depth': 3, 'gradientboostingclassifier__min_samples_leaf': 3}


# Comparing GridSearch Performance 

In [22]:
#--Dictionary of all GridSearches for bilding DF of metrics

grid_dict = {
    'LogisticRegression' : {'preprocesser':'cv', 'model':grid_logreg},
    'RandomForest' : {'preprocesser':'tf', 'model':grid_forest},
    'KNN' : {'preprocesser':'cv', 'model':grid_knn},
    'SVC' : {'preprocesser':'tf', 'model':grid_svc},
    'AdaBoostClassifier' : {'preprocesser':'tf', 'model':grid_boost},
    'GradientBoostingClassifier' : {'preprocesser':'tf', 'model':grid_gboost}
}

metrics_dict = {}

In [23]:
#--Builds DF of metrics for all GridSearches

for grid in grid_dict.keys():
    if grid_dict[grid]['preprocesser'] == 'cv':
        tn, fp, fn, tp = confusion_matrix(y_test, grid_dict[grid]['model'].predict(x_test_cv)).flatten()
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        specificity = tn/(tn+fp)
        balanced_acc = (specificity+recall)/2
        metrics_dict[grid] = {
            'precision' : precision,
            'recall' : recall,
            'specificity' : specificity,
            'balanced accuracy' : balanced_acc,
            'accuracy' : grid_dict[grid]['model'].score(x_test_cv, y_test)
        }
    else:
        tn, fp, fn, tp = confusion_matrix(y_test, grid_dict[grid]['model'].predict(x_test_tf)).flatten()
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        specificity = tn/(tn+fp)
        balanced_acc = (specificity+recall)/2
        metrics_dict[grid] = {
            'precision' : precision,
            'recall' : recall,
            'specificity' : specificity,
            'balanced accuracy' : balanced_acc,
            'accuracy' : grid_dict[grid]['model'].score(x_test_tf, y_test)
        }

In [24]:
#--DF of all metrics for best models

df_metrics = pd.DataFrame(metrics_dict).T

#with open('../data/gridsearch_metrics.data', 'wb') as fh:
#    pickle.dump(df_metrics, fh)

In [25]:
df_metrics['improvement'] = df_metrics['accuracy'] - [.7805, .7669, .7421, .7964, .7986, .8303]
df_metrics

Unnamed: 0,precision,recall,specificity,balanced accuracy,accuracy,improvement
LogisticRegression,0.851613,0.825,0.622951,0.723975,0.769231,-0.011269
RandomForest,0.759615,0.9875,0.180328,0.583914,0.764706,-0.002194
KNN,0.748768,0.95,0.163934,0.556967,0.733032,-0.009068
SVC,0.847262,0.91875,0.565574,0.742162,0.821267,0.024867
AdaBoostClassifier,0.843195,0.890625,0.565574,0.728099,0.800905,0.002305
GradientBoostingClassifier,0.723982,1.0,0.0,0.5,0.723982,-0.106318
