# Project 5: Group Project
#### Author: Adam Pardo, Brandon Bergeron, Eric Bayless, Ramesh Babu

### 02 - ML modeling  

Task: Comparing different Maching Learning models on our sample data containing ~400 restaurants

Information: 

In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

import pickle

import warnings
warnings.filterwarnings('ignore')

## Functions

In [5]:
#--formatted printing for model scores

def print_scores(model):
    print(f'train score: {model.score(x_train, y_train)}')
    print(f'test score: {model.score(x_test, y_test)}')

In [7]:
#--- Getting feature importances- only for pipelines
def get_imports(pipe, transformer, estimator, imports):
    '''
    takes a pipeline, it's named transformer, it's named estimator, and # of important features desired
    
    
    Args:
        pipe: name of pipeline instance
        transformer (str): ex 'countvectorizer'
        estimator (str): ex. 'logisticregression', 'ridge'
        imports (int): number of important features desired
        
        
    '''
    features = pipe.named_steps[transformer].get_feature_names()
    coefs = pipe.named_steps[estimator].coef_
    coefs_df = pd.DataFrame({'importance': coefs[0]}, index = features)
    
    return coefs_df.sort_values('importance', ascending=False).head(imports)

## Reading in reviews and combining

In [8]:
# combines all reviews for each restaurant into one observation

#-read in reviews
#df_reviews = pd.read_csv('../data/Las_Vegas_400_reviews.csv')
df_reviews = pd.read_csv('../data/Las_Vegas_reviews.csv')

#--combine all reviews
df_revs_combined = df_reviews.groupby(['business_id', 'name', 
                               'address', 'city' ,
                               'state', 'postal_code', 
                               'latitude' ,'longitude' , 
                               'stars', 'review_count', 
                               'is_open', 'attributes', 'categories']).agg({'text': ' '.join})

#--reset index and add review length column for total 
df_revs_combined = df_revs_combined.reset_index()
df_revs_combined['review_wc'] = df_revs_combined['text'].str.split().str.len()

In [9]:
df_revs_combined.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,text,review_wc
0,-0RkJ_uIduNLWQrphbADRw,Rooster Boy Cafe,"2620 Regatta Dr, Ste 113",Las Vegas,NV,89128,36.207539,-115.268154,4.0,194,1,"{'WheelchairAccessible': 'True', 'RestaurantsP...","Coffee & Tea, Restaurants, Cafes, Food, Breakf...",Amazing food and service. So grateful for the ...,24200


## Baseline: 72% accuracy

In [10]:
df_revs_combined['is_open'].value_counts(normalize=True)

1    0.724548
0    0.275452
Name: is_open, dtype: float64

# Modeling

In [11]:
x = df_revs_combined['text']
y = df_revs_combined['is_open']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=251)

In [12]:
cvect = CountVectorizer(stop_words='english', max_features=500)
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
ss = StandardScaler(with_mean=False)

In [17]:
model_dict = {
    'LogisticRegression' : LogisticRegression(max_iter=1000, n_jobs=-1),
    'MultinomialNB' : MultinomialNB(),
    'RandomForest' : RandomForestClassifier(n_jobs=-1),
    'ExtraTrees' : ExtraTreesClassifier(n_jobs=-1),
    'K-NearestNeighbors' : KNeighborsClassifier(n_jobs=-1),
    'SVC' : SVC(),
    'AdaBoostClassifier' : AdaBoostClassifier(n_estimators=100),
    'GradientBoostingClassifier' : GradientBoostingClassifier()    
}

In [79]:
df_models = {}

for key in model_dict.keys():  
    
    estimator = model_dict[key]

    pipe_cvect = make_pipeline(cvect, ss, estimator)
    pipe_tfidf = make_pipeline(tfidf, ss, estimator)

    pipe_cvect.fit(x_train, y_train)
    pipe_tfidf.fit(x_train, y_train)
    
    df_models[f'{key}_1'] = {
        'preprocessing' : 'CountVectorizer',
        'train_score' : pipe_cvect.score(x_train, y_train),
        'test_score' : pipe_cvect.score(x_test, y_test)
    }
    
    df_models[f'{key}_2'] = {
        'preprocessing' : 'TfidfVectorizer',
        'train_score' : pipe_tfidf.score(x_train, y_train),
        'test_score' : pipe_tfidf.score(x_test, y_test)
    }
    
    

    print(f'{key} with CountVectorizer:')
    print_scores(pipe_cvect)
    print()
    print(f'{key} with TfidfVectorizer:')
    print_scores(pipe_tfidf)
    print('\n\n')

LogisticRegression with CountVectorizer:
train score: 0.9260935143288085
test score: 0.7760180995475113

LogisticRegression with TfidfVectorizer:
train score: 0.995475113122172
test score: 0.7579185520361991



MultinomialNB with CountVectorizer:
train score: 0.7187028657616893
test score: 0.6628959276018099

MultinomialNB with TfidfVectorizer:
train score: 0.702865761689291
test score: 0.6470588235294118



RandomForest with CountVectorizer:
train score: 0.27526395173454
test score: 0.27601809954751133

RandomForest with TfidfVectorizer:
train score: 1.0
test score: 0.7873303167420814



ExtraTrees with CountVectorizer:
train score: 0.30844645550527905
test score: 0.3009049773755656

ExtraTrees with TfidfVectorizer:
train score: 1.0
test score: 0.7782805429864253



K-NearestNeighbors with CountVectorizer:
train score: 0.7285067873303167
test score: 0.7330316742081447

K-NearestNeighbors with TfidfVectorizer:
train score: 0.8092006033182504
test score: 0.7239819004524887



SVC with C

In [80]:
df_first_models = pd.DataFrame(df_models).T
df_first_models

Unnamed: 0,preprocessing,train_score,test_score
LogisticRegression_1,CountVectorizer,0.926094,0.776018
LogisticRegression_2,TfidfVectorizer,0.995475,0.757919
MultinomialNB_1,CountVectorizer,0.718703,0.662896
MultinomialNB_2,TfidfVectorizer,0.702866,0.647059
RandomForest_1,CountVectorizer,0.275264,0.276018
RandomForest_2,TfidfVectorizer,1.0,0.78733
ExtraTrees_1,CountVectorizer,0.308446,0.300905
ExtraTrees_2,TfidfVectorizer,1.0,0.778281
K-NearestNeighbors_1,CountVectorizer,0.728507,0.733032
K-NearestNeighbors_2,TfidfVectorizer,0.809201,0.723982


In [81]:
#with open('../data/first_models.data', 'wb') as fh:
#    pickle.dump(df_first_models, fh)

# Add Tokenizer and Parameter Searching

In [22]:
from nltk.tokenize import RegexpTokenizer

In [31]:
def split_lemmatize(text):
    'returns a lowercase lemmatized list of words'
    text_lower = text.lower()
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(x) for x in text_lower.split()])

tfidf_lem = TfidfVectorizer(preprocessor = split_lemmatize, stop_words = 'english', max_features=750)
cvect_lem = CountVectorizer(preprocessor = split_lemmatize, stop_words = 'english', max_features=750)

In [32]:
x_train_tf = tfidf_lem.fit_transform(x_train)
x_test_tf = tfidf_lem.transform(x_test)

In [35]:
x_train_cv = cvect_lem.fit_transform(x_train)
x_test_cv = cvect_lem.transform(x_test)

## Logistic Regression:

In [36]:
logreg = LogisticRegression(max_iter=1000, n_jobs=-1)

pipe_logreg = make_pipeline(ss, logreg)

params = {
    'logisticregression__C' : [0, 1, 10, 100],
    'logisticregression__penalty' : ['l1', 'l2', 'elasticnet', 'none'],    
}

grid_logreg = GridSearchCV(pipe_logreg, params, n_jobs=-1)
grid_logreg.fit(x_train_cv, y_train)


print('Logistic Regression with GridSearch')
print(f'train score: {grid_logreg.score(x_train_cv, y_train)}')
print(f'test score: {grid_logreg.score(x_test_cv, y_test)}')
print(f'best params: {grid_logreg.best_params_}')

Logistic Regression with GridSearch
train score: 1.0
test score: 0.7624434389140271
best params: {'logisticregression__C': 0, 'logisticregression__penalty': 'none'}


## RandomForest

In [41]:
forest = RandomForestClassifier(n_jobs=-1)

pipe_forest = make_pipeline(ss, forest)

params_forest = {
    'randomforestclassifier__min_samples_leaf' : [3, 5, 10, 20],
    'randomforestclassifier__n_estimators' : [100, 200, 300],
    'randomforestclassifier__max_depth' : [5, 10, 20, None]
}

grid_forest = GridSearchCV(pipe_forest, params_forest, n_jobs=-1)

grid_forest.fit(x_train_tf, y_train)


print('RandomForest with GridSearch:')
print(f'train score: {grid_forest.score(x_train_tf, y_train)}')
print(f'test score: {grid_forest.score(x_test_tf, y_test)}')
print(f'best params: {grid_forest.best_params_}')

RandomForest with GridSearch:
train score: 0.9992458521870287
test score: 0.7828054298642534
best params: {'randomforestclassifier__max_depth': 20, 'randomforestclassifier__min_samples_leaf': 3, 'randomforestclassifier__n_estimators': 100}


## KNN

In [43]:
knn = KNeighborsClassifier(n_jobs=-1)

pipe_knn = make_pipeline(ss, knn)

params_knn = {
    'kneighborsclassifier__n_neighbors' : [3, 5, 7, 9, 15],
    'kneighborsclassifier__p' : [1, 2]
}

grid_knn = GridSearchCV(pipe_knn, params_knn, n_jobs=-1)

grid_knn.fit(x_train_cv, y_train)


print('RandomForest with GridSearch:')
print(f'train score: {grid_knn.score(x_train_cv, y_train)}')
print(f'test score: {grid_knn.score(x_test_cv, y_test)}')
print(f'best params: {grid_knn.best_params_}')

RandomForest with GridSearch:
train score: 0.7835595776772247
test score: 0.751131221719457
best params: {'kneighborsclassifier__n_neighbors': 15, 'kneighborsclassifier__p': 1}


## SVC

In [44]:
svc = SVC()

pipe_svc = make_pipeline(ss, svc)

params_svc = {
    'svc__kernel' : ['rbf', 'sigmoid'],
    'svc__C' : [0, 1, 5, 10]
}

grid_svc = GridSearchCV(pipe_svc, params_svc, n_jobs=-1)
grid_svc.fit(x_train_tf, y_train)


print('SVC with GridSearch:')
print(f'train score: {grid_svc.score(x_train_tf, y_train)}')
print(f'test score: {grid_svc.score(x_test_tf, y_test)}')
print(f'best params: {grid_svc.best_params_}')

SVC with GridSearch:
train score: 0.9992458521870287
test score: 0.8212669683257918
best params: {'svc__C': 5, 'svc__kernel': 'rbf'}


# Boosting

#### AdaBoost

In [45]:
adaboost = AdaBoostClassifier()

pipe_boost = make_pipeline(ss, adaboost)

params_boost = {
    'adaboostclassifier__n_estimators' : [50, 100, 200],
    'adaboostclassifier__learning_rate' : [1, 5, 10]
}

grid_boost = GridSearchCV(pipe_boost, params_boost, n_jobs=-1)
grid_boost.fit(x_train_tf, y_train)

print('AdaBoost with GridSearch')
print(f'train score: {grid_boost.score(x_train_tf, y_train)}')
print(f'test score: {grid_boost.score(x_test_tf, y_test)}')
print(f'best_params: {grid_boost.best_params_}')

AdaBoost with GridSearch
train score: 1.0
test score: 0.7873303167420814
best_params: {'adaboostclassifier__learning_rate': 1, 'adaboostclassifier__n_estimators': 200}


#### GradientBoost

In [51]:
gboost = GradientBoostingClassifier()

pipe_gboost = make_pipeline(ss, gboost)

params_gboost = {
    'gradientboostingclassifier__min_samples_leaf' : [3, 6, 10, 20],
    'gradientboostingclassifier__max_depth' : [3, 5, 9],
    'gradientboostingclassifier__ccp_alpha' : [1, 10, 100]

}

grid_gboost = GridSearchCV(pipe_gboost, params_gboost, n_jobs=-1)
grid_gboost.fit(x_train_tf, y_train)

print('GradBoost with GridSearch')
print(f'train score: {grid_gboost.score(x_train_tf, y_train)}')
print(f'test score: {grid_gboost.score(x_test_tf, y_test)}')
print(f'best_params: {grid_gboost.best_params_}')

GradBoost with GridSearch
train score: 0.72473604826546
test score: 0.7239819004524887
best_params: {'gradientboostingclassifier__ccp_alpha': 1, 'gradientboostingclassifier__max_depth': 3, 'gradientboostingclassifier__min_samples_leaf': 3}


# Comparing GridSearch Performance 

In [64]:
#--Dictionary of all GridSearches for bilding DF of metrics

grid_dict = {
    'LogisticRegression' : {'preprocesser':'cv', 'model':grid_logreg},
    'RandomForest' : {'preprocesser':'tf', 'model':grid_forest},
    'KNN' : {'preprocesser':'cv', 'model':grid_knn},
    'SVC' : {'preprocesser':'tf', 'model':grid_svc},
    'AdaBoostClassifier' : {'preprocesser':'tf', 'model':grid_boost},
    'GradientBoostingClassifier' : {'preprocesser':'tf', 'model':grid_gboost}
}

metrics_dict = {}

In [54]:
grid_logreg.best_estimator_.named_steps['']

{'standardscaler': StandardScaler(with_mean=False),
 'logisticregression': LogisticRegression(C=0, max_iter=1000, n_jobs=-1, penalty='none')}

In [66]:
#--Builds DF of metrics for all GridSearches

for grid in grid_dict.keys():
    if grid_dict[grid]['preprocesser'] == 'cv':
        tn, fp, fn, tp = confusion_matrix(y_test, grid_dict[grid]['model'].predict(x_test_cv)).flatten()
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        specificity = tn/(tn+fp)
        balanced_acc = (specificity+recall)/2
        metrics_dict[grid] = {
            'precision' : precision,
            'recall' : recall,
            'specificity' : specificity,
            'balanced accuracy' : balanced_acc,
            'accuracy' : grid_dict[grid]['model'].score(x_test_cv, y_test)
        }
    else:
        tn, fp, fn, tp = confusion_matrix(y_test, grid_dict[grid]['model'].predict(x_test_tf)).flatten()
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        specificity = tn/(tn+fp)
        balanced_acc = (specificity+recall)/2
        metrics_dict[grid] = {
            'precision' : precision,
            'recall' : recall,
            'specificity' : specificity,
            'balanced accuracy' : balanced_acc,
            'accuracy' : grid_dict[grid]['model'].score(x_test_tf, y_test)
        }

In [73]:
#--DF of all metrics for best models

df_metrics = pd.DataFrame(metrics_dict).T
df_metrics

Unnamed: 0,precision,recall,specificity,balanced accuracy,accuracy
LogisticRegression,0.850163,0.815625,0.622951,0.719288,0.762443
RandomForest,0.778607,0.978125,0.270492,0.624308,0.782805
KNN,0.761194,0.95625,0.213115,0.584682,0.751131
SVC,0.847262,0.91875,0.565574,0.742162,0.821267
AdaBoostClassifier,0.834711,0.946875,0.508197,0.727536,0.825792
GradientBoostingClassifier,0.723982,1.0,0.0,0.5,0.723982


In [82]:
#with open('../data/gridsearch_metrics.data', 'wb') as fh:
#    pickle.dump(df_metrics, fh)