In [1]:
import json
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
# from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import warnings # supress warnings
warnings.filterwarnings('ignore')

## Split Data

In [2]:
df = pd.read_csv('filted_nv.csv')

In [3]:
df[['date','hours']].head()

Unnamed: 0,date,hours
0,2011-03-17 03:41:10,"{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ..."
1,2010-04-03 22:55:37,"{'Monday': '11:0-2:0', 'Tuesday': '11:0-2:0', ..."
2,2010-04-03 23:13:29,"{'Monday': '6:0-14:30', 'Tuesday': '6:0-14:30'..."
3,2019-03-23 01:12:17,"{'Monday': '4:30-21:0', 'Tuesday': '4:30-21:0'..."
4,2017-11-05 21:45:35,"{'Monday': '6:0-13:0', 'Tuesday': '6:0-13:0', ..."


In [6]:
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year

In [7]:
df[['year','review_id']].groupby(by='year').count()

Unnamed: 0_level_0,review_id
year,Unnamed: 1_level_1
2005,112
2006,1045
2007,3972
2008,9350
2009,15472
2010,30319
2011,49161
2012,56268
2013,74437
2014,112741


In [9]:
df[df['year']==2018].to_csv('filted_nv_2018.csv')
df[df['year']==2017].to_csv('filted_nv_2017.csv')
df[df['year']==2019].to_csv('filted_nv_2019.csv')

In [13]:
df[['attributes','categories','hours','yelping_since','elite']].head()

Unnamed: 0,attributes,categories,hours,yelping_since,elite
0,"{'RestaurantsReservations': 'True', 'Alcohol':...","Nightlife, Bars, Restaurants, Burgers, America...","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ...",2009-03-01 06:03:45,2011201220132014.0
1,"{'RestaurantsReservations': 'True', 'Restauran...","Restaurants, Thai","{'Monday': '11:0-2:0', 'Tuesday': '11:0-2:0', ...",2009-03-01 06:03:45,2011201220132014.0
2,"{'RestaurantsTakeOut': 'True', 'NoiseLevel': ""...","Cafes, Restaurants, American (New), Breakfast ...","{'Monday': '6:0-14:30', 'Tuesday': '6:0-14:30'...",2009-03-01 06:03:45,2011201220132014.0
3,"{'BikeParking': 'False', 'RestaurantsReservati...","Diners, Restaurants, Burgers, American (Tradit...","{'Monday': '4:30-21:0', 'Tuesday': '4:30-21:0'...",2009-08-31 03:13:14,
4,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...","American (New), Breakfast & Brunch, American (...","{'Monday': '6:0-13:0', 'Tuesday': '6:0-13:0', ...",2017-11-05 14:33:43,


## Read Data

In [2]:
df = pd.read_csv('filted_nv_2018.csv')
df_19 = pd.read_csv('filted_nv_2019.csv')

## Data Preprocess

In [3]:
import numpy as np
df['label'] = np.where((df['useful_r']+df['funny_r']+df['cool_r']) > 2, 1, 0)
df_19['label'] = np.where((df_19['useful_r']+df_19['funny_r']+df_19['cool_r']) > 2, 1, 0)

df['yelping_since_month'] = (pd.to_datetime(df['date']).dt.year-pd.to_datetime(df['yelping_since']).dt.year)*12+ (pd.to_datetime(df['date']).dt.month-pd.to_datetime(df['yelping_since']).dt.month)
df_19['yelping_since_month'] = (pd.to_datetime(df_19['date']).dt.year-pd.to_datetime(df_19['yelping_since']).dt.year)*12+ (pd.to_datetime(df_19['date']).dt.month-pd.to_datetime(df_19['yelping_since']).dt.month)

df['elite_u'] = df['elite'].apply(lambda x: 1 if pd.notnull(x) else 0)
df_19['elite_u'] = df_19['elite'].apply(lambda x: 1 if pd.notnull(x) else 0)

df['elite_count'] = df['elite'].apply(lambda x: len(x.split(',')) if pd.notnull(x) else 0)
df_19['elite_count'] = df_19['elite'].apply(lambda x: len(x.split(',')) if pd.notnull(x) else 0)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254313 entries, 0 to 254312
Data columns (total 52 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           254313 non-null  int64  
 1   Unnamed: 0.1         254313 non-null  int64  
 2   Unnamed: 0.1.1       254313 non-null  int64  
 3   Unnamed: 0_x         254313 non-null  int64  
 4   review_id            254313 non-null  object 
 5   user_id              254313 non-null  object 
 6   business_id          254313 non-null  object 
 7   stars_r              254313 non-null  int64  
 8   useful_r             254313 non-null  int64  
 9   funny_r              254313 non-null  int64  
 10  cool_r               254313 non-null  float64
 11  text                 254313 non-null  object 
 12  date                 254313 non-null  object 
 13  Unnamed: 0_y         254313 non-null  float64
 14  name_b               254313 non-null  object 
 15  address          

In [5]:
continuous_feature = ['stars_r', 'latitude',
       'longitude',
       'stars_b', 'review_count_b','funny_u',
       'review_count_u', 'compliment_hot', 'compliment_funny',
       'compliment_plain','fans','compliment_photos', 'compliment_profile',
       'useful_u', 'average_stars', 'cool_u',
       'compliment_cute', 'compliment_cool',
       'compliment_list', 'compliment_writer', 'compliment_more',
       'compliment_note','yelping_since_month','elite_u', 'elite_count']
text_feature = ['text','city','attributes','categories']

In [6]:
train_set = df
test_set = df_19
del df
del df_19

In [7]:
y_train = train_set['label']
y_test = test_set['label']

In [8]:
  scaler = preprocessing.StandardScaler().fit(train_set[continuous_feature])
  X_train_pre = scaler.transform(train_set[continuous_feature])
  X_test_pre = scaler.transform(test_set[continuous_feature])

## Define Functions

In [9]:
def evaluation(y_true, y_pred, probs):
    print('Confusion Matrix: '+'\n',metrics.confusion_matrix(y_true, y_pred))
    print('Accuracy: ', metrics.accuracy_score(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))
    roc_auc = metrics.roc_auc_score(y_true, probs[:,1])
    print('AUC: ', roc_auc)

In [10]:
# def fit_evaluate(model, X, y, x_test, y_true):
def fit_evaluate(model, X, y, x_test, y_true, output='yes'):
    model.fit(X,y)
    y_pred = model.predict(x_test)
    prob = model.predict_proba(x_test)
    
    print('Confusion Matrix: \n', metrics.confusion_matrix(y_true, y_pred))
    acc = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)
    print('Accuracy: ', metrics.accuracy_score(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))
    roc_auc = metrics.roc_auc_score(y_true, prob[:,1])
    print("AUC:", roc_auc)
    if output == 'yes':
        return y_pred, prob, acc, precision, recall, f1, roc_auc
    else:
        return None

In [11]:
def cross_val(model, parameter, X, y):
    tuned = GridSearchCV(model, parameter, verbose=2)
    tuned.fit(X,y)
    print("Best score: {:.2%}".format(tuned.best_score_))
    print("Best Hyperparameters:{}".format(tuned.best_params_))  

In [None]:
# Confusion Matrix
def standard_confusion_matrix(y_true, y_pred):
    '''
    Reformat confusion matrix output from sklearn for plotting profit curve.
    '''
    [[tn, fp], [fn, tp]] = metrics.confusion_matrix(y_true, y_pred)
    return np.array([[tp, fp], [fn, tn]])

def plot_confusion_matrix(y_real, y_pred, Model):
    cm = standard_confusion_matrix(y_real, y_pred)
    ax= plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, fmt='g', cmap="YlGnBu",xticklabels=[1,0], yticklabels=[1,0])
    ax.set_xlabel('True labels')
    ax.set_ylabel('Predicted labels')
    plt.title('{} Confusion Matrix'.format(Model))

In [None]:
# ROC Curve
def ROC_curve(probs, test, model_name):
    auc = [0 for i in range(len(probs))]
    fpr = [0 for i in range(len(probs))]
    tpr = [0 for i in range(len(probs))]

    for i in range(len(probs)):
        # calculate scores
        auc[i] = metrics.roc_auc_score(test, probs[i][:, 1])
        # summarize scores
        print('Model %s: ROC AUC=%.3f' % (model_name[i], auc[i]))
        # calculate roc curves
        fpr[i], tpr[i], _ = metrics.roc_curve(test, probs[i][:, 1])
        # plot the roc curve for the model
        plt.plot(fpr[i], tpr[i], label='Model %s' % (model_name[i]))
    # axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    plt.title('ROC Curve')
    # show the plot
    plt.show()

# Continuous Features - Model

## NB

In [12]:
nb_1 = MultinomialNB()
nb_1.fit(train_set[continuous_feature],train_set['label'])
y_1 = nb_1.predict(test_set[continuous_feature])
y_1_prob = nb_1.predict_proba(test_set[continuous_feature])

In [13]:
evaluation(y_test,y_1,y_1_prob)

Confusion Matrix: 
 [[200066   8949]
 [ 22734  15249]]
Accuracy:  0.871727706297217
              precision    recall  f1-score   support

           0       0.90      0.96      0.93    209015
           1       0.63      0.40      0.49     37983

    accuracy                           0.87    246998
   macro avg       0.76      0.68      0.71    246998
weighted avg       0.86      0.87      0.86    246998

AUC:  0.6885209831485246


In [14]:
# hyperparameters tuning

tuned_parameters= {
    'alpha': np.linspace(0.1, 0.5, 5),
}

nb_tuned = GridSearchCV(MultinomialNB(), tuned_parameters, verbose=1)
nb_tuned.fit(train_set[continuous_feature],train_set['label'])

print("Best score: {:.2%}".format(nb_tuned.best_score_))
print("Best Hyperparameters:{}".format(nb_tuned.best_params_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best score: 85.84%
Best Hyperparameters:{'alpha': 0.1}


In [15]:
nb_new = MultinomialNB(alpha=0.1)
# y_pred, prob, acc, precision, recall, f1, roc_auc
mnb_pred, mnb_prob, mnb_acc, mnb_precision, mnb_recall, mnb_f1, mnb_auc = fit_evaluate(nb_new,train_set[continuous_feature],train_set['label'], test_set[continuous_feature], y_test)

Confusion Matrix: 
 [[200066   8949]
 [ 22734  15249]]
Accuracy:  0.871727706297217
              precision    recall  f1-score   support

           0       0.90      0.96      0.93    209015
           1       0.63      0.40      0.49     37983

    accuracy                           0.87    246998
   macro avg       0.76      0.68      0.71    246998
weighted avg       0.86      0.87      0.86    246998

AUC: 0.6885209888797128


In [16]:
nb_2 = GaussianNB()
nb_2.fit(X_train_pre,y_train)
y_2 = nb_2.predict(X_test_pre)
y_2_prob = nb_2.predict_proba(X_test_pre)
evaluation(y_test,y_2,y_2_prob)

Confusion Matrix: 
 [[208268    747]
 [ 31189   6794]]
Accuracy:  0.8707034065053159
              precision    recall  f1-score   support

           0       0.87      1.00      0.93    209015
           1       0.90      0.18      0.30     37983

    accuracy                           0.87    246998
   macro avg       0.89      0.59      0.61    246998
weighted avg       0.87      0.87      0.83    246998

AUC:  0.7708256691679266


In [17]:
nb_2_parameter = {
    'var_smoothing': [1e-6, 1e-7, 1e-8]
}
cross_val(nb_2, nb_2_parameter, X_train_pre, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ................................var_smoothing=1e-06; total time=   0.2s
[CV] END ................................var_smoothing=1e-06; total time=   0.1s
[CV] END ................................var_smoothing=1e-06; total time=   0.1s
[CV] END ................................var_smoothing=1e-06; total time=   0.1s
[CV] END ................................var_smoothing=1e-06; total time=   0.1s
[CV] END ................................var_smoothing=1e-07; total time=   0.1s
[CV] END ................................var_smoothing=1e-07; total time=   0.1s
[CV] END ................................var_smoothing=1e-07; total time=   0.1s
[CV] END ................................var_smoothing=1e-07; total time=   0.1s
[CV] END ................................var_smoothing=1e-07; total time=   0.1s
[CV] END ................................var_smoothing=1e-08; total time=   0.1s
[CV] END ................................var_smoo

In [18]:
nb_gaussian = GaussianNB(var_smoothing=1e-06)
# y_pred, prob, acc, precision, recall, f1, roc_auc
gnb_pred, gnb_prob, gnb_acc, gnb_precision, gnb_recall, gnb_f1, gnb_auc = fit_evaluate(nb_gaussian,X_train_pre,y_train,X_test_pre,y_test)

Confusion Matrix: 
 [[208268    747]
 [ 31189   6794]]
Accuracy:  0.8707034065053159
              precision    recall  f1-score   support

           0       0.87      1.00      0.93    209015
           1       0.90      0.18      0.30     37983

    accuracy                           0.87    246998
   macro avg       0.89      0.59      0.61    246998
weighted avg       0.87      0.87      0.83    246998

AUC: 0.7708254283320572


## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
fit_evaluate(lr,X_train_pre,y_train,X_test_pre,y_test,output='no')

Confusion Matrix: 
 [[206594   2421]
 [ 27775  10208]]
Accuracy:  0.8777479979594977
              precision    recall  f1-score   support

           0       0.88      0.99      0.93    209015
           1       0.81      0.27      0.40     37983

    accuracy                           0.88    246998
   macro avg       0.84      0.63      0.67    246998
weighted avg       0.87      0.88      0.85    246998

AUC: 0.7714010207066266


In [20]:
lr_parameter = {
    'C': [1, 5, 10, 50, 100, 200, 300]
}
cross_val(lr, lr_parameter, X_train_pre, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END ................................................C=1; total time=   1.3s
[CV] END ................................................C=1; total time=   1.5s
[CV] END ................................................C=1; total time=   1.3s
[CV] END ................................................C=1; total time=   1.3s
[CV] END ................................................C=1; total time=   1.2s
[CV] END ................................................C=5; total time=   1.3s
[CV] END ................................................C=5; total time=   1.3s
[CV] END ................................................C=5; total time=   1.4s
[CV] END ................................................C=5; total time=   1.3s
[CV] END ................................................C=5; total time=   1.3s
[CV] END ...............................................C=10; total time=   1.3s
[CV] END ........................................

In [21]:
lr_new = LogisticRegression(C = 5)
lr_pred, lr_prob, lr_acc, lr_precision, lr_recall, lr_f1, lr_auc = fit_evaluate(lr_new,X_train_pre,y_train,X_test_pre,y_test)

Confusion Matrix: 
 [[206621   2394]
 [ 27809  10174]]
Accuracy:  0.8777196576490498
              precision    recall  f1-score   support

           0       0.88      0.99      0.93    209015
           1       0.81      0.27      0.40     37983

    accuracy                           0.88    246998
   macro avg       0.85      0.63      0.67    246998
weighted avg       0.87      0.88      0.85    246998

AUC: 0.7712056589823959


## SVM

In [22]:
## Too long to run.. > <
from sklearn.svm import SVR, SVC
svm = SVC(random_state=0, kernel = 'linear')
fit_evaluate(svm,X_train_pre,y_train,X_test_pre,y_test,output='no')

In [None]:
svm_parameters = {
    'C': [0.1, 1, 50, 100],
    'kernel':['rbf','linear','sigmoid']}
# cross_val(svm, svm_parameters, X_train_pre, y_train)

## KNN

In [22]:
knn = KNeighborsClassifier()
knn_pred, knn_prob, knn_acc, knn_precision, knn_recall, knn_f1, knn_auc = fit_evaluate(knn,X_train_pre,y_train,X_test_pre,y_test)

Confusion Matrix: 
 [[204039   4976]
 [ 26349  11634]]
Accuracy:  0.873177110745836
              precision    recall  f1-score   support

           0       0.89      0.98      0.93    209015
           1       0.70      0.31      0.43     37983

    accuracy                           0.87    246998
   macro avg       0.79      0.64      0.68    246998
weighted avg       0.86      0.87      0.85    246998

AUC: 0.7154233444685851


## Random Forest

In [23]:
rf = RandomForestClassifier(random_state=0)
fit_evaluate(rf,X_train_pre,y_train,X_test_pre,y_test,output='no')

Confusion Matrix: 
 [[205029   3986]
 [ 23419  14564]]
Accuracy:  0.8890476845966364
              precision    recall  f1-score   support

           0       0.90      0.98      0.94    209015
           1       0.79      0.38      0.52     37983

    accuracy                           0.89    246998
   macro avg       0.84      0.68      0.73    246998
weighted avg       0.88      0.89      0.87    246998

AUC: 0.8762346028406066


In [24]:
rf_parameters= { 
    'n_estimators': [10, 50, 100],
    'max_features': ['auto', 'log2'],
    'max_depth' : [3, 5, 10],
    'criterion' :['gini', 'entropy']
}
cross_val(rf, rf_parameters, X_train_pre, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=10; total time=   0.8s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=10; total time=   0.8s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=10; total time=   0.8s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=10; total time=   0.8s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=10; total time=   0.8s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=50; total time=   3.8s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=50; total time=   3.9s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=50; total time=   3.8s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=50; total time=   4.4s
[CV] END criterion=gini, max_depth=3, max_features=auto, n_estimators=50; total time=   3.9s
[CV] END

In [25]:
rf_new = RandomForestClassifier(random_state=0, n_estimators = 50, max_features = 'auto', max_depth = 3,  criterion = 'entropy')
rf_pred, rf_prob, rf_acc, rf_precision, rf_recall, rf_f1, rf_auc = fit_evaluate(rf_new,X_train_pre,y_train,X_test_pre,y_test)

Confusion Matrix: 
 [[206551   2464]
 [ 26219  11764]]
Accuracy:  0.8838735536320133
              precision    recall  f1-score   support

           0       0.89      0.99      0.94    209015
           1       0.83      0.31      0.45     37983

    accuracy                           0.88    246998
   macro avg       0.86      0.65      0.69    246998
weighted avg       0.88      0.88      0.86    246998

AUC: 0.8439144052970504


## XGBoost

In [34]:
xgb_model = XGBClassifier(eval_metric='auc')
fit_evaluate(xgb_model,X_train_pre,y_train,X_test_pre,y_test,output='no')

Confusion Matrix: 
 [[205267   3748]
 [ 22688  15295]]
Accuracy:  0.8929707932857756
              precision    recall  f1-score   support

           0       0.90      0.98      0.94    209015
           1       0.80      0.40      0.54     37983

    accuracy                           0.89    246998
   macro avg       0.85      0.69      0.74    246998
weighted avg       0.89      0.89      0.88    246998

AUC: 0.8974304253719017


In [35]:
FI = pd.Series(xgb_model.feature_importances_)
FI = FI.sort_values(ascending = False)
print(FI)

13    0.378060
6     0.113369
3     0.073504
21    0.060331
0     0.056327
4     0.056061
11    0.042228
9     0.040592
2     0.022708
1     0.021692
8     0.019044
17    0.013435
20    0.013139
5     0.012716
12    0.012165
16    0.011772
10    0.011595
14    0.011508
18    0.010090
19    0.009996
7     0.009672
15    0.000000
dtype: float32


In [39]:
from sklearn.feature_selection import SelectFromModel
from numpy import sort

# Fit model using each importance as a threshold
thresholds = sort(xgb_model.feature_importances_)

for i in np.arange(len(thresholds)-1, 1, -1):
    thresh = thresholds[i]
    # select features using threshold
    selection = SelectFromModel(xgb_model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(X_train_pre)
    # train model
    selection_model = XGBClassifier(eval_metric='logloss')
    selection_model.fit(select_x_train, y_train)
    # eval model
    select_x_val = selection.transform(X_test_pre)
    predictions = selection_model.predict(select_x_val)
    probs = selection_model.predict_proba(select_x_val)
    roc_auc = metrics.roc_auc_score(y_test, probs[:,1])
    accuracy = metrics.accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%, AUC: %.3f%%" % (thresh, select_x_train.shape[1], accuracy*100.0, roc_auc*100.0))

Thresh=0.378, n=1, Accuracy: 88.52%, AUC: 83.932%
Thresh=0.113, n=2, Accuracy: 88.51%, AUC: 83.926%
Thresh=0.074, n=3, Accuracy: 88.45%, AUC: 84.215%
Thresh=0.060, n=4, Accuracy: 88.53%, AUC: 84.198%
Thresh=0.056, n=5, Accuracy: 88.50%, AUC: 85.327%
Thresh=0.056, n=6, Accuracy: 88.85%, AUC: 87.792%
Thresh=0.042, n=7, Accuracy: 89.00%, AUC: 88.934%
Thresh=0.041, n=8, Accuracy: 89.02%, AUC: 88.901%
Thresh=0.023, n=9, Accuracy: 89.07%, AUC: 89.347%
Thresh=0.022, n=10, Accuracy: 89.10%, AUC: 89.515%
Thresh=0.019, n=11, Accuracy: 89.08%, AUC: 89.534%
Thresh=0.013, n=12, Accuracy: 89.11%, AUC: 89.503%
Thresh=0.013, n=13, Accuracy: 89.23%, AUC: 89.619%
Thresh=0.013, n=14, Accuracy: 89.26%, AUC: 89.649%
Thresh=0.012, n=15, Accuracy: 89.34%, AUC: 89.768%
Thresh=0.012, n=16, Accuracy: 89.29%, AUC: 89.728%
Thresh=0.012, n=17, Accuracy: 89.30%, AUC: 89.791%
Thresh=0.012, n=18, Accuracy: 89.33%, AUC: 89.740%
Thresh=0.010, n=19, Accuracy: 89.24%, AUC: 89.722%
Thresh=0.010, n=20, Accuracy: 89.28%, AU

In [42]:
FI = pd.Series(xgb_model.feature_importances_, index = list(train_set[continuous_feature].columns)) 
FI = FI.sort_values(ascending = False).head(17)
top_features = list(FI.index)
top_features

['cool_u',
 'compliment_funny',
 'funny_u',
 'elite_u',
 'stars_r',
 'review_count_u',
 'useful_u',
 'compliment_photos',
 'review_count_b',
 'stars_b',
 'fans',
 'compliment_writer',
 'yelping_since_month',
 'compliment_hot',
 'average_stars',
 'compliment_list',
 'compliment_profile']

In [27]:
xgb_parameters = {
        'n_estimators': [10, 50, 150],
        'learning_rate': [0.01, 0.1, 0.3],
         'gamma': [0.5, 1, 5],
        'max_depth': [3, 6, 10]
        }
cross_val(xgb_model, xgb_parameters, X_train_pre, y_train)

ate=0.1, max_depth=10, n_estimators=50; total time=   5.6s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=50; total time=   5.0s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=50; total time=   5.3s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=50; total time=   5.6s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=150; total time=  15.0s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=150; total time=  15.2s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=150; total time=  14.2s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=150; total time=  15.3s
[CV] END gamma=5, learning_rate=0.1, max_depth=10, n_estimators=150; total time=  14.7s
[CV] END gamma=5, learning_rate=0.3, max_depth=3, n_estimators=10; total time=   0.5s
[CV] END gamma=5, learning_rate=0.3, max_depth=3, n_estimators=10; total time=   0.4s
[CV] END gamma=5, learning_rate=0.3, max_depth=3, n_estimators=10; t

In [29]:
xgb_new = XGBClassifier(random_state=0, n_estimators = 150, learning_rate = 0.1, max_depth = 10,  gamma = 1)
xgb_pred, xgb_prob, xgb_acc, xgb_precision, xgb_recall, xgb_f1, xgb_auc = fit_evaluate(xgb_new,X_train_pre,y_train,X_test_pre,y_test)

Confusion Matrix: 
 [[205171   3844]
 [ 22638  15345]]
Accuracy:  0.8927845569599754
              precision    recall  f1-score   support

           0       0.90      0.98      0.94    209015
           1       0.80      0.40      0.54     37983

    accuracy                           0.89    246998
   macro avg       0.85      0.69      0.74    246998
weighted avg       0.89      0.89      0.88    246998

AUC: 0.8969000539625389


## LightGBM

In [30]:
lgbm = LGBMClassifier(random_state=0)
fit_evaluate(lgbm,X_train_pre,y_train,X_test_pre,y_test,output='no')

Confusion Matrix: 
 [[206050   2965]
 [ 23329  14654]]
Accuracy:  0.8935456967262893
              precision    recall  f1-score   support

           0       0.90      0.99      0.94    209015
           1       0.83      0.39      0.53     37983

    accuracy                           0.89    246998
   macro avg       0.87      0.69      0.73    246998
weighted avg       0.89      0.89      0.88    246998

AUC: 0.8980664414129536


In [48]:
lgbm_parameters = {
        'n_estimators': [50, 100, 300],
        'num_leaves': [15, 31, 80],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [-1, 10, 30]
        }
cross_val(lgbm, lgbm_parameters, X_train_pre, y_train)

al time=   1.2s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=50, num_leaves=80; total time=   1.2s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=50, num_leaves=80; total time=   1.3s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=15; total time=   0.9s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=15; total time=   0.9s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=15; total time=   0.9s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=15; total time=   0.9s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=15; total time=   0.9s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=31; total time=   1.2s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=31; total time=   1.2s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=31; total time=   1.2s
[CV] END learning_rate=0.1, max_depth=10, n_es

In [50]:
lgbm_new = LGBMClassifier(random_state=0, n_estimators = 50, num_leaves = 15, max_depth = -1,  learning_rate = 0.1)
lgbm_pred, lgbm_prob, lgbm_acc, lgbm_precision, lgbm_recall, lgbm_f1, lgbm_auc = fit_evaluate(lgbm_new,X_train_pre,y_train,X_test_pre,y_test)

Confusion Matrix: 
 [[206407   2608]
 [ 24196  13787]]
Accuracy:  0.891480902679374
              precision    recall  f1-score   support

           0       0.90      0.99      0.94    209015
           1       0.84      0.36      0.51     37983

    accuracy                           0.89    246998
   macro avg       0.87      0.68      0.72    246998
weighted avg       0.89      0.89      0.87    246998

AUC: 0.8931410477834936


## CatBoost

In [46]:
cat = CatBoostClassifier()
fit_evaluate(cat,X_train_pre,y_train,X_test_pre,y_test,output='no')

g: 12.2s
622:	learn: 0.2240745	total: 20.1s	remaining: 12.2s
623:	learn: 0.2240496	total: 20.2s	remaining: 12.2s
624:	learn: 0.2240319	total: 20.2s	remaining: 12.1s
625:	learn: 0.2240127	total: 20.2s	remaining: 12.1s
626:	learn: 0.2239913	total: 20.3s	remaining: 12.1s
627:	learn: 0.2239621	total: 20.3s	remaining: 12s
628:	learn: 0.2239319	total: 20.3s	remaining: 12s
629:	learn: 0.2239085	total: 20.3s	remaining: 11.9s
630:	learn: 0.2238752	total: 20.4s	remaining: 11.9s
631:	learn: 0.2238603	total: 20.4s	remaining: 11.9s
632:	learn: 0.2238603	total: 20.4s	remaining: 11.8s
633:	learn: 0.2238473	total: 20.4s	remaining: 11.8s
634:	learn: 0.2238160	total: 20.5s	remaining: 11.8s
635:	learn: 0.2237917	total: 20.5s	remaining: 11.7s
636:	learn: 0.2237538	total: 20.5s	remaining: 11.7s
637:	learn: 0.2237370	total: 20.6s	remaining: 11.7s
638:	learn: 0.2237055	total: 20.6s	remaining: 11.6s
639:	learn: 0.2236964	total: 20.6s	remaining: 11.6s
640:	learn: 0.2236748	total: 20.6s	remaining: 11.6s
641:	le

In [49]:
cat_parameters = {'depth' : [5, 10],'learning_rate' : [0.01, 0.05, 0.1, 0.3]}
cross_val(cat, cat_parameters, X_train_pre, y_train)

4.5s	remaining: 9.09s
615:	learn: 0.2608861	total: 14.5s	remaining: 9.07s
616:	learn: 0.2608600	total: 14.6s	remaining: 9.05s
617:	learn: 0.2608339	total: 14.6s	remaining: 9.03s
618:	learn: 0.2608110	total: 14.6s	remaining: 9.01s
619:	learn: 0.2607938	total: 14.7s	remaining: 8.98s
620:	learn: 0.2607667	total: 14.7s	remaining: 8.96s
621:	learn: 0.2607513	total: 14.7s	remaining: 8.93s
622:	learn: 0.2607373	total: 14.7s	remaining: 8.9s
623:	learn: 0.2607167	total: 14.7s	remaining: 8.88s
624:	learn: 0.2606130	total: 14.8s	remaining: 8.85s
625:	learn: 0.2605958	total: 14.8s	remaining: 8.83s
626:	learn: 0.2605748	total: 14.8s	remaining: 8.8s
627:	learn: 0.2605550	total: 14.8s	remaining: 8.78s
628:	learn: 0.2605402	total: 14.8s	remaining: 8.76s
629:	learn: 0.2605204	total: 14.9s	remaining: 8.73s
630:	learn: 0.2604692	total: 14.9s	remaining: 8.71s
631:	learn: 0.2603825	total: 14.9s	remaining: 8.69s
632:	learn: 0.2603653	total: 14.9s	remaining: 8.66s
633:	learn: 0.2603449	total: 15s	remaining: 

In [51]:
cat_new = CatBoostClassifier(random_state=0, depth = 5, learning_rate = 0.01)
cat_pred, cat_prob, cat_acc, cat_precision, cat_recall, cat_f1, cat_auc = fit_evaluate(cat_new,X_train_pre,y_train,X_test_pre,y_test)

.2607513	total: 22.4s	remaining: 13.6s
622:	learn: 0.2607373	total: 22.4s	remaining: 13.5s
623:	learn: 0.2607167	total: 22.4s	remaining: 13.5s
624:	learn: 0.2606130	total: 22.4s	remaining: 13.5s
625:	learn: 0.2605958	total: 22.5s	remaining: 13.4s
626:	learn: 0.2605748	total: 22.5s	remaining: 13.4s
627:	learn: 0.2605550	total: 22.5s	remaining: 13.4s
628:	learn: 0.2605402	total: 22.6s	remaining: 13.3s
629:	learn: 0.2605204	total: 22.6s	remaining: 13.3s
630:	learn: 0.2604692	total: 22.6s	remaining: 13.2s
631:	learn: 0.2603825	total: 22.6s	remaining: 13.2s
632:	learn: 0.2603653	total: 22.7s	remaining: 13.1s
633:	learn: 0.2603449	total: 22.7s	remaining: 13.1s
634:	learn: 0.2603294	total: 22.7s	remaining: 13.1s
635:	learn: 0.2603152	total: 22.7s	remaining: 13s
636:	learn: 0.2602977	total: 22.8s	remaining: 13s
637:	learn: 0.2602663	total: 22.8s	remaining: 12.9s
638:	learn: 0.2602372	total: 22.8s	remaining: 12.9s
639:	learn: 0.2602184	total: 22.8s	remaining: 12.8s
640:	learn: 0.2601475	total: 

## Sampling

## Evaluation

In [52]:
# Define dataframe to store results
evaluation = pd.DataFrame(columns=['Accuracy', 'ROC', 'Precision', 'Recall', 'F1-score'],index=['lr','knn','rf','xgb','mnb','gnb','lgbm','cat'])
models=['lr','knn','rf','xgb','mnb','gnb','lgbm','cat']
for model in models:
     evaluation.loc[model] = [globals()[model + "_acc"], globals()[model + "_auc"], globals()[model + "_precision"],globals()[model + "_recall"],globals()[model + "_f1"]] 

In [53]:
evaluation

Unnamed: 0,Accuracy,ROC,Precision,Recall,F1-score
lr,0.87772,0.771206,0.809516,0.267857,0.402524
knn,0.873177,0.715423,0.700421,0.306295,0.426208
rf,0.883874,0.843914,0.82682,0.309718,0.450633
xgb,0.892785,0.8969,0.799677,0.403997,0.536801
mnb,0.871728,0.688521,0.630176,0.401469,0.490471
gnb,0.870703,0.770825,0.900942,0.178869,0.29848
lgbm,0.891481,0.893141,0.840927,0.362978,0.50708
cat,0.891056,0.893085,0.83882,0.360898,0.504666
