In [1]:
# Import package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from timeit import default_timer as timer 
# 
path = '../Data/'

In [2]:
df = pd.read_csv(path + 'df_cleaned.csv')

In [3]:
df.target.value_counts()

0    800000
1    800000
Name: target, dtype: int64

In [4]:
# checking for nulls
df.isnull().sum()

target           0
ids              0
tweet_date       0
flag             0
user             0
text             0
text_clean    8258
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
# Label ONLY has Positive(1) or Negative(0) on the target field so, this exercise is a binary classification problem.
df.target.value_counts()

0    796361
1    795381
Name: target, dtype: int64

In [7]:
pd.set_option("display.max_colwidth", 0)
df[['target','text', 'text_clean']].sample(5)

Unnamed: 0,target,text,text_clean
37213,0,"@liesforliars damn dude! Yeah its hard enough for me to keep up with everybody with only about 25 ppl to follow, workin 2 jobs.",damn dud yeah hard enough keep everybody ppl follow workin job
1584432,1,i`m wondering if @trohman `s pic is really him. the kiddie pic. coz i think it`s really adorable,wond pic real kiddy pic coz think real ad
1521965,1,@Epigrammist haha i know! i am looking into it,hah know look
302711,0,"@jimmyxc16 Pfft. You're dog does NOT beat my dog. My dog loves everyone, you're dog hates me",pfft dog not beat dog dog lov everyon dog hat
437645,0,Leaving new york today Really don't want to leave my friends behind.,leav new york today real not want leav friend behind


In [8]:
pd.Series(' '.join(df.text_clean).split()).value_counts()[:10]

not      292036
get      110917
day      108276
good     92504 
work     88161 
lik      84000 
lov      83818 
go       74000 
quot     73139 
today    68686 
dtype: int64

#### User functions

In [9]:
def scoring_model(model, X_train, X_test, y_train, y_test, y_pred):
    # Import
    from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve
    #
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, 
                                                                    model.predict_proba(X_train)[:,1])
    #
    tr_score = model.score(X_train, y_train)
    ts_score = model.score(X_test, y_test)
    acc = accuracy_score(y_test, y_pred)
    auc = auc(false_positive_rate, true_positive_rate)
    roc_tr = roc_auc_score(y_train, model.predict(X_train))
    roc_t = roc_auc_score(y_test, model.predict(X_test))
    return tr_score,ts_score,acc,auc,roc_tr,roc_t

In [10]:
# create empty DataFrame
log = pd.DataFrame()
# initial cm value
cm_ = np.array([[0,0], [0,0]])
# 
def logging_metrics(m, c, v, cm, training_score, \
                    test_score, auc_score, roc_auc_train, roc_auc_test, start, end):
    
    log_dict = {'Model': [],
                'Classifier': [],
                'Vectorizer': [],
                'TP': [],
                'FN': [],
                'FP': [],
                'TN': [],
                'Training_Score': [],
                'Test_Score': [],
                'AUC_Score': [],
                'ROC_AUC_Training': [],
                'ROC_AUC_Test': [],
                'Duration_Mins': []
                }

    log_dict['Model'].append(m)
    log_dict['Classifier'].append(c)
    log_dict['Vectorizer'].append(v)
    log_dict['TP'].append(cm[0,0])
    log_dict['FN'].append(cm[0,1])
    log_dict['FP'].append(cm[1,0])
    log_dict['TN'].append(cm[1,1])
    log_dict['Training_Score'].append(training_score)
    log_dict['Test_Score'].append(test_score)
    log_dict['AUC_Score'].append(auc_score)
    log_dict['ROC_AUC_Training'].append(roc_auc_train)
    log_dict['ROC_AUC_Test'].append(roc_auc_test)
    log_dict['ROC_AUC_Test'].append(roc_auc_test)
    log_dict['Duration_Mins'].append((end - start)/60)
    return log_dict

## Vectorizer

In [11]:
# import library
from sklearn.model_selection import train_test_split
# Getting tokenization of tweet text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# 
t_size = 0.20
seed = 11
# 

# TfidfVectorizer: unigrams and bigrams
parms_t = {'max_df' : 0.995,
           'min_df': 0.001,
           'ngram_range' : (1,2),
          }
# CountVectorizer
parms_c = {'max_df' : 0.995,
           'min_df': 0.001,
           'ngram_range': (1,2),
          }
# 

### Count Vectorizer with unigrams and bigrams

In [12]:
start = timer()
m, c, v = 'cv', 'unigrams and bigrams', 'Count Vectorizer'
# incode Count Vectorizer
cv = CountVectorizer(**parms_c)
# 
X = cv.fit_transform(df.text_clean).toarray()
y = df.target
# 
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y, test_size=t_size, random_state=seed)
# 
print('Count Vectorizer shape: ', X.shape)
# 
end = timer()
# Log matrics
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm_, 0, 0, 0, 0, 0, start, end), 
                                        orient='index').transpose())

Count Vectorizer shape:  (1591742, 1163)


### Tfidf Vectorizer with unigrams and bigrams

In [13]:
start = timer()
m, c, v = 'tv', 'unigrams and bigrams', 'Tfidf Vectorizer'
# incode Tfidf Vectorizer
tv = TfidfVectorizer(**parms_t)
# 
X_ = tv.fit_transform(df.text_clean).toarray()
y_ = df.target
# 
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_, y_, test_size=t_size, random_state=seed)
# 
print('Tfidf Vectorizer shape: ', X_.shape, )
# 
end = timer()
# Log matrics
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm_, 0, 0, 0, 0, 0, start, end), 
                                        orient='index').transpose())

Tfidf Vectorizer shape:  (1591742, 1163)


### Base Model Performance 

In [14]:
from sklearn.dummy import DummyClassifier
# Count
dummy_clf_c = DummyClassifier(strategy="stratified")
# 
dummy_clf_c.fit(X_train_c, y_train_c)
# 
dummy_clf_c.predict(X_test_c)
# Tfidf
dummy_clf_f = DummyClassifier(strategy="stratified")
# 
dummy_clf_f.fit(X_train_t, y_train_t)
# 
dummy_clf_f.predict(X_test_t)
# 
print('Count Base Score: ', dummy_clf_c.score(X_test_c, y_test_c))
print('Tfidf Base Score: ' ,dummy_clf_c.score(X_test_t, y_test_t))


Count Base Score:  0.49988220475013273
Tfidf Base Score:  0.5013240186085083


### Binary Classification Models

In [15]:
# Import
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
# 
params_lrc = {'random_state' : seed,
              'n_jobs' : (-1),
              'multi_class' : 'ovr',
             }
# 
params_lgbm = {'objective': 'binary',
               'metric': 'auc',
               'n_jobs' : (-1),
               'random_state' : seed,
              }
# 

#### Logistic Regression with Count Vectorizer

In [16]:
start = timer()
m, c, v = 'lrc_c', 'Logistic Regression', 'Count Vectorizer'
filename = model_path + 'LogisticRegression_counter.pkl'
# 
# Logistic Regression
lrc_c = LogisticRegression(**params_lrc)
# 
%time lrc_c.fit(X_train_c, y_train_c)
# Predict test results
y_pred_ = lrc_c.predict(X_test_c)
# 
print('\n')
cm = confusion_matrix(y_test_c, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(lrc_c, X_train_c, X_test_c, y_train_c, y_test_c, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save the model to disk
pickle.dump(lrc_c, open(filename, 'wb'))
# 
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 11.6 s, sys: 22.2 s, total: 33.8 s
Wall time: 4h 7min 53s


[[115709  43692]
 [ 31126 127822]]


Training set score:         0.7637846289
Test set score:             0.7649811999
Accuracy Test set Score:    0.7649811999
AUC Score:                  0.8394390391
ROC AUC Training set Score: 0.7638005041
ROC AUC Test set Score:     0.7650368920


In [17]:
start = timer()
m, c, v = 'lrc_t', 'Logistic Regression', 'Tfidf Vectorizer'
filename = model_path + 'LogisticRegression_Tfidf.pkl'
# 
# Logistic Regression
lrc_t = LogisticRegression(**params_lrc)
# 
%time lrc_t.fit(X_train_t, y_train_t)
# Predict test results
y_pred_ = lrc_t.predict(X_test_t)
# 
print('\n')
cm = confusion_matrix(y_test_t, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(lrc_t, X_train_t, X_test_t, y_train_t, y_test_t, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save the model to disk
pickle.dump(lrc_c, open(filename, 'wb'))
# 
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 16.4 s, sys: 41.3 s, total: 57.8 s
Wall time: 2h 30min 31s


[[117805  41596]
 [ 33141 125807]]


Training set score:         0.7646084123
Test set score:             0.7652356376
Accuracy Test set Score:    0.7652356376
AUC Score:                  0.8447820997
ROC AUC Training set Score: 0.7646191458
ROC AUC Test set Score:     0.7652729548


#### Light GBM Classifier with Count Vectorizer

In [18]:
start = timer()
m, c, v = 'lgbm_c', 'LGBM Classifier', 'Count Vectorizer'
filename = model_path + 'LGBMClassifier_counter.pkl'
# 
# Light GBM Classifier 
lgbm_c = LGBMClassifier(**params_lgbm)
# Train LGBM Classifier
%time lgbm_c.fit(X_train_c, y_train_c)
# Predict the response for test dataset
y_pred_ = lgbm_c.predict(X_test_c)
# 
print('\n')
cm = confusion_matrix(y_test_c, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(lgbm_c, X_train_c, X_test_c, y_train_c, y_test_c, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save model
joblib.dump(lgbm_c, filename)
# load model
# gbm_pickle = joblib.load(filename)
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 1min 45s, sys: 9.13 s, total: 1min 54s
Wall time: 19.9 s


[[112406  46995]
 [ 32518 126430]]


Training set score:         0.7494371337
Test set score:             0.7502332346
Accuracy Test set Score:    0.7502332346
AUC Score:                  0.8298985699
ROC AUC Training set Score: 0.7494558184
ROC AUC Test set Score:     0.7502974387


#### Light GBM Classifier with Tfidf Vectorizer

In [19]:
start = timer()
m, c, v = 'lgbm_t', 'LGBM Classifier', 'Tfidf Vectorizer'
filename = model_path + 'LGBMClassifier_tfidf.pkl'
# 
# LGBM Classifier
lgbm_t = LGBMClassifier(**params_lgbm)
# Train LGBM Classifier
%time lgbm_t.fit(X_train_t, y_train_t)
# Predict the response for test dataset
y_pred_ = lgbm_t.predict(X_test_t)
# 
print('\n')
cm = confusion_matrix(y_test_t, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(lgbm_t, X_train_t, X_test_t, y_train_t, y_test_t, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save model
joblib.dump(lgbm_t, filename)
# load model
# gbm_pickle = joblib.load(model_path + 'LGBMClassifier_tfidf.pkl')
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 5min 4s, sys: 10.7 s, total: 5min 15s
Wall time: 22.8 s


[[112899  46502]
 [ 32720 126228]]


Training set score:         0.7510909829
Test set score:             0.7511473257
Accuracy Test set Score:    0.7511473257
AUC Score:                  0.8319207025
ROC AUC Training set Score: 0.7511088300
ROC AUC Test set Score:     0.7512084252


## Bayesian Optimization:

#### Bayesian Optimization Tfidf Vectorizer

In [20]:
start = timer()
m, c, v = 'lgbBO', 'Bayesian Optimization', 'Tfidf Vectorizer'
# import
from bayes_opt import BayesianOptimization
import lightgbm
# 
# categorical_features = ['text_clean']
# 
def lgb_eval(num_leaves, max_depth, lambda_l2, lambda_l1, min_child_samples, min_data_in_leaf, \
             learning_rate,subsample_freq):
    params = {
        "objective" : "binary",
        "metric" : "auc", 
        'is_unbalance': True,
        "num_leaves" : int(num_leaves),
        "max_depth" : int(max_depth),
        "lambda_l2" : lambda_l2,
        "lambda_l1" : lambda_l1,
        "num_threads" : 20,
        "min_child_samples" : int(min_child_samples),
        'min_data_in_leaf': int(min_data_in_leaf),
        "learning_rate" : learning_rate,
        "subsample_freq" : int(subsample_freq),
        "bagging_seed" : seed,
        "verbosity" : -1
    }
    # lgtrain = lightgbm.Dataset(X_train, y_train,categorical_feature=categorical_features)
    lgtrain = lightgbm.Dataset(X_train_t, y_train_t)
    cv_result = lightgbm.cv(params,
                       lgtrain,
                       1000,
                       early_stopping_rounds=100,
                       stratified=True,
                       nfold=5)
    return cv_result['auc-mean'][-1]
# 
lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (25, 4000),
                                        'max_depth': (5, 63),
                                        'lambda_l2': (0.0, 0.05),
                                        'lambda_l1': (0.0, 0.05),
                                        'min_child_samples': (50, 10000),
                                        'min_data_in_leaf': (100, 2000),
                                        'learning_rate': (0.001, 2.0),
                                        'subsample_freq': (1, 10),
                                        })

%time lgbBO.maximize(n_iter=10, init_points=2)
# 
end = timer()
# Log matrics
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm_, 0, 0, 0, 0, 0, start, end), 
                                        orient='index').transpose())

|   iter    |  target   | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_da... | num_le... | subsam... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8487  [0m | [0m 0.02615 [0m | [0m 0.03264 [0m | [0m 0.5577  [0m | [0m 40.94   [0m | [0m 2.327e+0[0m | [0m 834.8   [0m | [0m 3.034e+0[0m | [0m 6.38    [0m |
| [0m 2       [0m | [0m 0.8372  [0m | [0m 0.03934 [0m | [0m 0.04851 [0m | [0m 1.04    [0m | [0m 47.17   [0m | [0m 4.113e+0[0m | [0m 1.628e+0[0m | [0m 1.833e+0[0m | [0m 9.224   [0m |
| [0m 3       [0m | [0m 0.8348  [0m | [0m 0.01977 [0m | [0m 0.01642 [0m | [0m 1.284   [0m | [0m 57.26   [0m | [0m 8.875e+0[0m | [0m 1.376e+0[0m | [0m 3.64e+03[0m | [0m 9.401   [0m |
| [95m 4       [0m | [95m 0.8526  [0m | [95m 0.01326 [0m | [95m 0.04848 [0m | [95m 0.1248  [0m | [95m 40.8    [0m | [95m 2.824e+0[0m | 

In [21]:
start = timer()
m, c, v = 'lgbm_BO_t', 'LGBM Classifier - Bayesian Optimization', 'Tfidf Vectorizer'
filename = model_path + 'LGBMClassifier_BO_tfidf.pkl'
# 
# Capture Best Bayesian Optimization Values
params_BO = lgbBO.max
params_BO = params_BO['params']
# Round to INT keys
info = ['max_depth','min_child_samples', 'min_data_in_leaf', 'num_leaves', 'subsample_freq']
# 
params_ = {'objective': 'binary',
           'metric': 'auc',
           'n_jobs' : (-1),
           'random_state' : seed,
           'verbosity': (-1),
          }
# merge dictionaries in one line by simply using the unpacking operator (**)
params_BO = {**params_, **params_BO,}
# 
for k, v in params_BO.items():
    if k in info:
        params_BO[k] = int(v) 
# 
# LGBM Classifier
lgbm_BO_t = LGBMClassifier(**params_BO)
# Train LGBM Classifier
%time lgbm_BO_t.fit(X_train_t, y_train_t)
# Predict the response for test dataset
y_pred_ = lgbm_BO_t.predict(X_test_t)
# 
print('\n')
cm = confusion_matrix(y_test_t, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(lgbm_BO_t, X_train_t, X_test_t, y_train_t, y_test_t, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save model
joblib.dump(lgbm_BO_t, filename)
# load model
# gbm_pickle = joblib.load(filename)
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 4min 25s, sys: 2min 58s, total: 7min 23s
Wall time: 29.4 s


[[110369  49032]
 [ 29867 129081]]


Training set score:         0.7554337114
Test set score:             0.7521619355
Accuracy Test set Score:    0.7521619355
AUC Score:                  0.8377152912
ROC AUC Training set Score: 0.7554585357
ROC AUC Test set Score:     0.7522470982


#### Bayesian Optimization Count Vectorizer

In [22]:
start = timer()
m, c, v = 'lgbBO_c', 'Bayesian Optimization', 'Count Vectorizer'
# 
def lgb_eval_c(num_leaves, max_depth, lambda_l2, lambda_l1, min_child_samples, min_data_in_leaf, \
               learning_rate, subsample_freq):
    params = {
        "objective" : "binary",
        "metric" : "auc", 
        'is_unbalance': True,
        "num_leaves" : int(num_leaves),
        "max_depth" : int(max_depth),
        "lambda_l2" : lambda_l2,
        "lambda_l1" : lambda_l1,
        "num_threads" : 20,
        "min_child_samples" : int(min_child_samples),
        'min_data_in_leaf': int(min_data_in_leaf),
        "learning_rate" : learning_rate,
        "subsample_freq" : int(subsample_freq),
        "bagging_seed" : seed,
        "verbosity" : -1
    }
#     lgtrain = lightgbm.Dataset(X_train, y_train,categorical_feature=categorical_features)
    lgtrain_c = lightgbm.Dataset(X_train_c, y_train_c)
    cv_result = lightgbm.cv(params,
                       lgtrain_c,
                       1000,
                       early_stopping_rounds=100,
                       stratified=True,
                       nfold=5)
    return cv_result['auc-mean'][-1]
# 
lgbBO_c = BayesianOptimization(lgb_eval_c, {'num_leaves': (25, 4000),
                                        'max_depth': (5, 63),
                                        'lambda_l2': (0.0, 0.05),
                                        'lambda_l1': (0.0, 0.05),
                                        'min_child_samples': (50, 10000),
                                        'min_data_in_leaf': (100, 2000),
                                        'learning_rate': (0.001, 2.0),
                                        'subsample_freq': (1, 10),
                                        })

%time lgbBO_c.maximize(n_iter=10, init_points=2)
# 
end = timer()
# Log matrics
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm_, 0, 0, 0, 0, 0, start, end), 
                                        orient='index').transpose())

|   iter    |  target   | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_da... | num_le... | subsam... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8349  [0m | [0m 0.03221 [0m | [0m 0.04983 [0m | [0m 1.532   [0m | [0m 61.43   [0m | [0m 6.017e+0[0m | [0m 301.5   [0m | [0m 2.219e+0[0m | [0m 9.812   [0m |
| [95m 2       [0m | [95m 0.8443  [0m | [95m 0.01106 [0m | [95m 0.04557 [0m | [95m 1.349   [0m | [95m 34.5    [0m | [95m 8.324e+0[0m | [95m 489.5   [0m | [95m 1.768e+0[0m | [95m 5.909   [0m |
| [0m 3       [0m | [0m 0.8277  [0m | [0m 0.02556 [0m | [0m 0.000415[0m | [0m 1.719   [0m | [0m 16.4    [0m | [0m 78.76   [0m | [0m 1.437e+0[0m | [0m 430.7   [0m | [0m 4.749   [0m |
| [0m 4       [0m | [0m 0.8429  [0m | [0m 0.03596 [0m | [0m 0.02712 [0m | [0m 0.02196 [0m | [0m 28.45   [0m | [0m 2.76e+03[0m 

In [23]:
start = timer()
m, c, v = 'lgbm_BO_c', 'LGBM Classifier - Bayesian Optimization', 'Count Vectorizer'
filename = model_path + 'LGBMClassifier_BO_counter.pkl'
# 
# Capture Best Bayesian Optimization Values
params_BO_c = lgbBO_c.max
params_BO_c = params_BO_c['params']
# Round to INT keys
info = ['max_depth','min_child_samples', 'min_data_in_leaf', 'num_leaves', 'subsample_freq']
# 
params_ = {'objective': 'binary',
           'metric': 'auc',
           'n_jobs' : (-1),
           'random_state' : seed,
           'verbosity': (-1),
          }
# merge dictionaries in one line by simply using the unpacking operator (**)
params_BO_c = {**params_, **params_BO_c,}
# 
for k, v in params_BO_c.items():
    if k in info:
        params_BO_c[k] = int(v) 
# 
# LGBM Classifier
lgbm_BO_c = LGBMClassifier(**params_BO_c)
# Train LGBM Classifier
%time lgbm_BO_c.fit(X_train_c, y_train_c)
# Predict the response for test dataset
y_pred_ = lgbm_BO_c.predict(X_test_c)
# 
print('\n')
cm = confusion_matrix(y_test_c, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(lgbm_BO_c, X_train_c, X_test_c, y_train_c, y_test_c, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save model
joblib.dump(lgbm_BO_c,filename)
# load model
# gbm_pickle = joblib.load(filename)
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 1min 31s, sys: 22.2 s, total: 1min 53s
Wall time: 11.1 s


[[117188  42213]
 [ 32999 125949]]


Training set score:         0.7632655433
Test set score:             0.7637435645
Accuracy Test set Score:    0.7637435645
AUC Score:                  0.8393701921
ROC AUC Training set Score: 0.7632774680
ROC AUC Test set Score:     0.7637842712


### Neural Network with Tfidf Vectorizer

In [24]:
from sklearn.neural_network import MLPClassifier
# 
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
# 
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# 

#### Tuning hidden layer & learning rate for keras

In [25]:
start = timer()
m, c, v = 'lgbBO', 'Keras Tuning', 'Tfidf Vectorizer'
features = X_train_t.shape[1]
# 
def model_builder(hp):
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=(features,1)))

    # Tune the number of units in the first Dense layer
    # Choose an optimal value between 32-512
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(keras.layers.Dense(10))

    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.5, 0.1, 0.01, 0.001, or 0.00011
    hp_learning_rate = hp.Choice('learning_rate', values=[5e-1, 1e-1, 1e-2, 1e-3, 1e-4])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                metrics=['accuracy'])

    return model
# 
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='tmp',
                     project_name='keras_tuning')
# 
# Create a callback to stop training early after reaching a certain value for the validation loss.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
# 
tuner.search(X_train_t, y_train_t, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
# 
layer = (best_hps.get('units'))
learning_rate = (best_hps.get('learning_rate'))
# 
end = timer()
# Log matrics
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm_, 0, 0, 0, 0, 0, start, end), 
                                        orient='index').transpose())

Trial 30 Complete [00h 05m 22s]
val_accuracy: 0.7528928518295288

Best val_accuracy So Far: 0.7759571671485901
Total elapsed time: 01h 08m 27s
INFO:tensorflow:Oracle triggered exit


#### Keras Classifier with Tfidf (_k_) & Counter (_c_) Vectorizer

In [26]:
start = timer()
m, c, v = 'model_k_', 'Keras Classifier Layer: ' + str(layer), 'Tfidf Vectorizer'
# 
filename = model_path + 'KerasClassifier_tfidf_model_k_.h5'
# Function to create model, required for KerasClassifier
def create_model_(optimizer='adam', init='glorot_uniform'):
    # create model
    model = Sequential()
    model.add(Dense(layer, input_dim=features, activation='relu', kernel_initializer=init))
    model.add(Dense(1, activation='sigmoid'))
    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model
# 
# create model
model_k_ = KerasClassifier(build_fn=create_model_, verbose=1)

%time model_k_.fit(X_train_t, y_train_t, validation_split=0.20, epochs=15, batch_size=10)
# %time model_k_.fit(X_train_t, y_train_t)
# 
y_pred_ = model_k_.predict(X_test_t)
# 
cm = confusion_matrix(y_test_t, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(model_k_, X_train_t, X_test_t, y_train_t, y_test_t, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
model_k_.model.save(filename)
# 
# new_model = keras.models.load_model(filename)
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CPU times: user 1h 31min 46s, sys: 17min 19s, total: 1h 49min 5s
Wall time: 27min 58s
[[122055  37346]
 [ 32828 126120]]


Training set score:         0.7971372604
Test set score:             0.7795689702
Accuracy Test set Score:    0.7795689636
AUC Score:                  0.8779329158
ROC AUC Training set Score: 0.7971425082
ROC AUC Test set Score:     0.7795887120


In [27]:
start = timer()
m, c, v = 'model_c_', 'Keras Classifier Layer: ' + str(layer), 'Count Vectorizer'
# 
filename = model_path + 'KerasClassifier_count_model_c_.h5'
# 
# create model
model_c_ = KerasClassifier(build_fn=create_model_, verbose=1)

%time model_c_.fit(X_train_c, y_train_c, validation_split=0.20, epochs=15, batch_size=10)
# %time model_c_.fit(X_train_c, y_train_c)
# 
y_pred_ = model_c_.predict(X_test_c)
# 
cm = confusion_matrix(y_test_c, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(model_c_, X_train_c, X_test_c, y_train_c, y_test_c, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
model_c_.model.save(filename)
# 
# new_model = keras.models.load_model(filename)
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CPU times: user 1h 33min 3s, sys: 20min 5s, total: 1h 53min 8s
Wall time: 31min 21s
[[122294  37107]
 [ 34479 124469]]


Training set score:         0.8081558347
Test set score:             0.7751335502
Accuracy Test set Score:    0.7751335798
AUC Score:                  0.8882740422
ROC AUC Training set Score: 0.8081591827
ROC AUC Test set Score:     0.7751448712


#### MLP Classifier

In [29]:
# import sklearn.neural_network
# 
parms = {'hidden_layer_sizes': (100,),
         'activation': 'logistic',    # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’
         'solver': 'sgd',             # {‘lbfgs’, ‘sgd’, ‘adam’}
         'alpha': 0.0001,             # L2 penalty (regularization term) parameter
         'batch_size': 'auto',
         'learning_rate': 'constant', # ‘constant’ is a constant learning rate given by ‘learning_rate_init’.
                                      # {‘constant’, ‘invscaling’, ‘adaptive’}
         'learning_rate_init': 0.001, # The initial learning rate used. It controls the step-size in updating the 
                                      # weights. 
                                      # Only used when solver=’sgd’ or ‘adam’.
         'power_t': 0.5,              # The exponent for inverse scaling learning rate. It is used in updating  
                                      # effective learning rate when the learning_rate is set to ‘invscaling’. 
                                      # Only used when solver=’sgd’.
         'max_iter': 1000,
         'shuffle': True,
         'random_state': seed,
         'tol': 0.0001,
         'verbose': False,
         'warm_start': False,
         'momentum': 0.9,             # Momentum for gradient descent update. Should be between 0 and 1. 
                                      # Only used when solver=’sgd’.
         'nesterovs_momentum': True,
         'early_stopping': False,
         'validation_fraction': 0.1,  # The proportion of training data to set aside as 
                                      # validation set for early stopping. Must be between 0 and 1. 
                                      # Only used if early_stopping is True.
         'beta_1': 0.9,               # Exponential decay rate for estimates of first moment vector in adam, should 
                                      # be in [0, 1). Only used when solver=’adam’.
         'beta_2': 0.999,             # Exponential decay rate for estimates of second moment vector in adam, should 
                                      # be in [0, 1). Only used when solver=’adam’.
         'epsilon': 1e-08,            # Value for numerical stability in adam. Only used when solver=’adam’.
         'n_iter_no_change': 10,      # Maximum number of epochs to not meet tol improvement. Only effective 
                                      # when solver=’sgd’ or ‘adam’.
        }
####################################### 
start = timer()
m, c, v = 'mlp_tfidf_', 'MLP Classifier Layer: ' +  str(layer), 'Tfidf Vectorizer'
# 
filename = model_path + 'MLPClassifier_tfidf_.h5'
# 
parms = {'hidden_layer_sizes': (layer,),
         'activation': 'logistic',
         'solver': 'adam',
         'random_state': seed,
        }
# Create a model Tfidf 
mlp_tfidf_ = MLPClassifier(**parms)
# Train the model on the train data set
%time mlp_tfidf_.fit(X_train_t, y_train_t)
# Evaluate on test data
y_pred_ = mlp_tfidf_.predict(X_test_t)
# 
print('\n')
cm = confusion_matrix(y_test_t, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(mlp_tfidf_, X_train_t, X_test_t, y_train_t, y_test_t, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save model
joblib.dump(mlp_tfidf_, filename)
# load model
# gbm_pickle = joblib.load(filename)
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 1d 1h 38min 18s, sys: 5h 15min 20s, total: 1d 6h 53min 39s
Wall time: 1d 7h 28min 7s


[[122958  36443]
 [ 34732 124216]]


Training set score:         0.7908579677
Test set score:             0.7764246158
Accuracy Test set Score:    0.7764246158
AUC Score:                  0.8753059930
ROC AUC Training set Score: 0.7908597772
ROC AUC Test set Score:     0.7764318109


In [31]:
start = timer()
m, c, v = 'mlp_count_', 'MLP Classifier Layer: ' +  str(layer), 'Count Vectorizer'
# 
filename = model_path + 'MLPClassifier_count_.h5'
# 
parms = {'hidden_layer_sizes': (layer,),
         'activation': 'logistic',
         'solver': 'adam',
         'random_state': seed,
        }
# Create a model Tfidf 
mlp_count_ = MLPClassifier(**parms)
# Train the model on the train data set
%time mlp_count_.fit(X_train_c, y_train_c)
# Evaluate on test data
y_pred_ = mlp_count_.predict(X_test_c)
# 
print('\n')
cm = confusion_matrix(y_test_c, y_pred_)
print(cm)
print('\n')
# 
training_score,test_score,accuracy_score,auc_score,roc_auc_train,roc_auc_test = \
                    scoring_model(mlp_count_, X_train_c, X_test_c, y_train_c, y_test_c, y_pred_)
# print the scores on training and test set
print('Training set score:         {:.10f}'.format(training_score))
print('Test set score:             {:.10f}'.format(test_score))
print('Accuracy Test set Score:    {:.10f}'.format(accuracy_score))
print('AUC Score:                  {:.10f}'.format(auc_score))
print('ROC AUC Training set Score: {:.10f}'.format(roc_auc_train))
print('ROC AUC Test set Score:     {:.10f}'.format(roc_auc_test))
# 
# save model
joblib.dump(mlp_count_, filename)
# load model
# gbm_pickle = joblib.load(filename)
# 
end = timer()
# Log matrics 
log = log.append(pd.DataFrame.from_dict(logging_metrics(m, c, v, cm, training_score, test_score, 
                                            auc_score, roc_auc_train, roc_auc_test, start, end), 
                                        orient='index').transpose())

CPU times: user 18h 13min 51s, sys: 3h 30min 40s, total: 21h 44min 31s
Wall time: 21h 25min 11s


[[120470  38931]
 [ 34737 124211]]


Training set score:         0.8032021536
Test set score:             0.7685935875
Accuracy Test set Score:    0.7685935875
AUC Score:                  0.8875247872
ROC AUC Training set Score: 0.8032074945
ROC AUC Test set Score:     0.7686118655


In [32]:
log.dropna(inplace=True)
log

Unnamed: 0,Model,Classifier,Vectorizer,TP,FN,FP,TN,Training_Score,Test_Score,AUC_Score,ROC_AUC_Training,ROC_AUC_Test,Duration_Mins
0,cv,unigrams and bigrams,Count Vectorizer,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.669744
0,tv,unigrams and bigrams,Tfidf Vectorizer,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.045181
0,lrc_c,Logistic Regression,Count Vectorizer,115709,43692,31126,127822,0.763785,0.764981,0.839439,0.763801,0.765037,221.821038
0,lrc_t,Logistic Regression,Tfidf Vectorizer,117805,41596,33141,125807,0.764608,0.765236,0.844782,0.764619,0.765273,150.749775
0,lgbm_c,LGBM Classifier,Count Vectorizer,112406,46995,32518,126430,0.749437,0.750233,0.829899,0.749456,0.750297,0.782862
0,lgbm_t,LGBM Classifier,Tfidf Vectorizer,112899,46502,32720,126228,0.751091,0.751147,0.831921,0.751109,0.751208,0.589967
0,lgbBO,Bayesian Optimization,Tfidf Vectorizer,0,0,0,0,0.0,0.0,0.0,0.0,0.0,76.871454
0,lgbm_BO_t,LGBM Classifier - Bayesian Optimization,3.523413,110369,49032,29867,129081,0.755434,0.752162,0.837715,0.755459,0.752247,0.626383
0,lgbBO_c,Bayesian Optimization,Count Vectorizer,0,0,0,0,0.0,0.0,0.0,0.0,0.0,90.425354
0,lgbm_BO_c,LGBM Classifier - Bayesian Optimization,5.776941,117188,42213,32999,125949,0.763266,0.763744,0.83937,0.763277,0.763784,0.533199
