In [196]:
#Libraries for NLTK
from nltk.corpus import stopwords, gazetteers
from nltk import sent_tokenize, wordpunct_tokenize
from nltk.stem.porter import PorterStemmer

#Libraries for Text Processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Libraries for Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

#Models
from sklearn.ensemble import RandomForestClassifier     # Random Forest
from sklearn.naive_bayes import MultinomialNB           # Nave Bayes
from sklearn.linear_model import LogisticRegression     # logistic regression
from sklearn.svm import SVC                             # Support Vector Classifier
from sklearn.neighbors import KNeighborsClassifier      # K Nearest Neighbors
from sklearn.tree import DecisionTreeClassifier         
from sklearn.ensemble import AdaBoostClassifier         # Ada Boost Classifier
from sklearn.ensemble import GradientBoostingClassifier # Gradient Boosting
from sklearn.ensemble import ExtraTreesClassifier       # Extra Trees Classifier
from sklearn.ensemble import BaggingClassifier          # Bagging Classifier
from sklearn.ensemble import VotingClassifier           # Ensemble Model
import tensorflow as TF                                 # Deep Neural Networks

# Other Libraries
import csv
import re
import string
import numpy as np
import pandas as pd
import os
import time
import warnings
import tensorflow as tf
warnings.filterwarnings('ignore')

# to make this notebook's output stable across runs
np.random.seed(123)


from sklearn.model_selection import train_test_split #training and testing data split
from scipy.stats import reciprocal, uniform

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [197]:
FILE_ROOT = "C:\\Users\\Owner\\Desktop\\TD Text Analytics\\"
TD_SOURCE_FILE = FILE_ROOT + "train.txt"
LABEL_FILE = FILE_ROOT + "labels_Candidate.csv"

In [198]:
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [199]:
#Run this module only once - NLTK Stopwords 

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('gazetteers')

##### Preprocessing data - removing non ascii, links and entities like @, # and &. Further Tokenize by excluding Stop words & apply Stemming

In [200]:
def strip_non_ascii(string):
    ''' Returns the string without non ASCII characters'''
    stripped = (c for c in string if 0 < ord(c) < 127)
    return ''.join(stripped)

def pre_process(text):
            
        #Remove non-ascii characters
        text = strip_non_ascii(text)
                
        #Convert to lowercase
        text = text.lower()

        #Convert words with repeated characters
        text = re.sub(r'([a-z])\1{2,}',r'\1',text)
        
        return text

def tokenize(text):
    tokens = [word for sent in sent_tokenize(text) for word in wordpunct_tokenize(sent)]
         
    #Remove stopwords
    stop = stopwords.words("english")
    
    tokens = [token for token in tokens if token not in stop]

    #Remove all words less than 1 character
    tokens = [token for token in tokens if len(token) >= 1]
        
    #Apply stemming
    stemmer = PorterStemmer()
    tokens  = [stemmer.stem(token) for token in tokens]

    return tokens

##### Import the Data file

In [201]:
DataFile = pd.read_csv(TD_SOURCE_FILE, sep = None)

DataFile.columns = ["ID", "Wd","Text"]

First, I will merge all the words for every comment, to form complete sentences.

In [202]:
NewDataFile = DataFile.groupby('ID').apply(lambda x: x['Text'].values.tolist())

Now, I will replace my lists to Strings for text analysis. This will also eliminate the brakets "[]" and the comma separator from the entire text

In [203]:
FormatDataFile = pd.DataFrame(NewDataFile.apply(lambda x:' '.join(x)))
FormatDataFile.columns = ["Text"]

##### Partition the data to identify the initial training and test set.

In [204]:
#Now split Back The data to training and test set - before applying the pipeline
train_set, test_set = train_test_split(FormatDataFile, 
                                       test_size=0.28205,shuffle=False)

In [205]:
train_set.shape

(3998, 1)

In [206]:
test_set.shape

(1572, 1)

##### We can now add the labels as a new column to the training set.


In [207]:
Labels = pd.read_csv(LABEL_FILE)

In [208]:
# Extract the training labels
Extract_Train_Set = Labels[0:3999]
Extract_Train_Set.columns = ["ID", "Label"]
Extract_Train_Set = Extract_Train_Set.drop("ID",axis=1)

# Get the first column for test set
Extract_Text_Label = Labels[4000:]
Extract_Text_Label.columns = ["ID", "Label"]
Extract_Text_Label = Extract_Text_Label.drop("Label",axis=1)

Extract_Train_Set.shape

(3999, 1)

In [209]:
train_set['Label'] = Extract_Train_Set['Label']

In [210]:
train_set.shape

(3998, 2)

In [211]:
train_set.shape

(3998, 2)

With Training and Test Sets already defined and finalized, We also need a validation set to tune the model. Since we dont have test labels, Validation set approach will help improving over the accuracy.

In [212]:
#Now split Back The training data to training and validation set - before applying the pipeline
training_set, validation_set = train_test_split(train_set, 
                                       test_size=0.20,shuffle=True)

In [213]:
# Now define x and y.

#the Y Variable
train_set_y = training_set["Label"].copy()
validation_set_y = validation_set["Label"].copy()

#the X variables
train_set_X = training_set.drop("Label", axis=1)
validation_set_X = validation_set.drop("Label", axis=1)
test_set_X = test_set 

##### Build the TDM using the train set

In [214]:
cv = CountVectorizer(analyzer="word", ngram_range=(1,2), preprocessor=pre_process, tokenizer=tokenize)

In [215]:
x_train_tdm = cv.fit_transform(train_set_X["Text"])
x_validation_tdm = cv.transform(validation_set_X["Text"])
x_test_tdm = cv.transform(test_set_X["Text"])

x_train_tdm = x_train_tdm.toarray()
x_validation_tdm = x_validation_tdm.toarray()
x_test_tdm = x_test_tdm.toarray()

#Create the vocabulary and extract features in that Vocabulary.
vocab = cv.get_feature_names()

In [216]:
x_train_tdm.shape

(3198, 33886)

##### The Term Document Matrix has 33886 features. The next step is to reduce these dimensions and extract the best 100 features using f_classif test statistics. 

In [217]:
ch2 = SelectKBest(f_classif, k=100)
x_train_tdm = ch2.fit_transform(x_train_tdm, train_set_y)

In [218]:
x_val_tdm = ch2.transform(x_validation_tdm)
x_test_tdm = ch2.transform(x_test_tdm)

##### Build the first model - Random Forest Classifier

In [219]:
forest_class = RandomForestClassifier(random_state = 42)

n_estimators = [50, 100, 400, 700, 1000]
max_features = [5, 7, 10]
max_depth = [10, 20] 
oob_score = [True, False]
min_samples_split = [2, 4, 10, 12, 16]
min_samples_leaf = [1, 5, 10] 
max_leaf_nodes = [2, 10, 20]


param_grid_forest = {'n_estimators' : n_estimators, 'max_features' : max_features,
                     'max_depth' : max_depth, 'min_samples_split' : min_samples_split,
                    'oob_score' : oob_score, 'min_samples_leaf': min_samples_leaf, 
                     'max_leaf_nodes' : max_leaf_nodes}


rand_search_forest = RandomizedSearchCV(forest_class, param_grid_forest, cv = 4, scoring='roc_auc', refit = True,
                                 n_jobs = -1, verbose=2)

rand_search_forest.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   19.1s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100, 400, 700, 1000], 'max_features': [5, 7, 10], 'max_depth': [10, 20], 'min_samples_split': [2, 4, 10, 12, 16], 'oob_score': [True, False], 'min_samples_leaf': [1, 5, 10], 'max_leaf_nodes': [2, 10, 20]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [220]:
# Now we will try searching the best estimator and predict the values on the training set

forest_best_params_ = rand_search_forest.best_params_
forest_best_estimators_ = rand_search_forest.best_estimator_

print(forest_best_params_)
print(forest_best_estimators_)

{'oob_score': True, 'n_estimators': 1000, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_leaf_nodes': 2, 'max_features': 5, 'max_depth': 20}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=5, max_leaf_nodes=2,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)


In [221]:
# Get the training accuracy with Random Forest

y_pred_forest = forest_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_forest)

0.868980612883052

In [222]:
# We can now predict the accuracy with validation Set

y_pred_forest_validation = forest_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_forest_validation)

0.855

As we notice, Random Forest has performed well on the validation set with 85.5% accuracy. The model generalizes well but making a final conclusion that lets build more models

##### Nave Bayes Classifier

In [223]:
N_Bayes_ = MultinomialNB()

alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 1.5, 2]

param_grid_Bayes = {'alpha':alpha}

grid_search_bayes = GridSearchCV(N_Bayes_, param_grid_Bayes, cv = 4, scoring='roc_auc', refit = True,
                                 n_jobs = -1, verbose=2)

grid_search_bayes.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    3.5s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 1.5, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [224]:
bayes_best_params_ = grid_search_bayes.best_params_
bayes_best_estimators_ = grid_search_bayes.best_estimator_

print(bayes_best_params_)
print(bayes_best_estimators_)

{'alpha': 0.001}
MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)


In [225]:
# Get the training accuracy with Naive Bayes

y_pred_bayes = bayes_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_bayes)

0.890556597873671

In [226]:
# We can now predict the accuracy with validation Set

y_pred_bayes_validation = bayes_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_bayes_validation)

0.835

The results shows a good case of overfitting with Validation accuracy going down to 83.37%

##### Logistic Regression

In [227]:
log_reg = LogisticRegression(random_state = 42)

C = np.array(list(range(1, 100)))/10
                            
param_grid_log_reg = {'C' : C}

rand_search_log_reg = RandomizedSearchCV(log_reg, param_grid_log_reg, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_log_reg.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    4.2s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'C': array([0.1, 0.2, ..., 9.8, 9.9])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [228]:
logreg_best_params_ = rand_search_log_reg.best_params_
logreg_best_estimators_ = rand_search_log_reg.best_estimator_

print(logreg_best_params_)
print(logreg_best_estimators_)

{'C': 0.7}
LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [229]:
# Get the training accuracy with Logistic Regression

y_pred_logreg = logreg_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_logreg)

0.8749218261413383

In [230]:
# We can now predict the accuracy with validation Set

y_pred_logreg_validation = logreg_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_logreg_validation)

0.85375

The Logistic Regression model performs well on the validation set with 85.37% accuracy

##### Support Vector Classifier

In [231]:
SVC_Classifier = SVC(random_state = 42)

param_distributions = {"gamma": reciprocal(0.0001, 0.001), "C": uniform(100000, 1000000)}

rand_search_svc = RandomizedSearchCV(SVC_Classifier, param_distributions, n_iter=10, verbose=2, n_jobs = -1)

rand_search_svc.fit(x_train_tdm, train_set_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    9.6s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000178E8552BA8>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000178E86D1048>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [232]:
svc_best_params_ = rand_search_svc.best_params_
svc_best_estimators_ = rand_search_svc.best_estimator_

print(svc_best_params_)
print(svc_best_estimators_)

{'C': 500752.9320304509, 'gamma': 0.0008216275767080257}
SVC(C=500752.9320304509, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0008216275767080257,
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)


In [233]:
y_pred_svc_estimator = svc_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_svc_estimator)

0.8949343339587242

In [234]:
# We can now predict the accuracy with validation Set

y_pred_svc_validation = svc_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_svc_validation)

0.83875

The Model accuracy drops from 89.4% to 83.8%, showing clear signs of overfitting

##### K - Nearest Neighbors Classifier

In [235]:
#Introduce KNN Classifier 

KNeighbours = KNeighborsClassifier()
leaf_size = list(range(1,15,2))
n_neighbors = list(range(4,20,2))

param_grid_KNeighbours = {'n_neighbors' : n_neighbors,'leaf_size':leaf_size}

grid_search_KNeighbours = GridSearchCV(KNeighbours, param_grid_KNeighbours, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

grid_search_KNeighbours.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 56 candidates, totalling 224 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 224 out of 224 | elapsed:  7.2min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': [4, 6, 8, 10, 12, 14, 16, 18], 'leaf_size': [1, 3, 5, 7, 9, 11, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [236]:
knn_best_params_ = grid_search_KNeighbours.best_params_
knn_best_estimators_ = grid_search_KNeighbours.best_estimator_

print(knn_best_params_)
print(knn_best_estimators_)

{'leaf_size': 1, 'n_neighbors': 12}
KNeighborsClassifier(algorithm='auto', leaf_size=1, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=12, p=2,
           weights='uniform')


In [237]:
y_pred_knn_estimator = knn_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_knn_estimator)

0.8692933083176986

In [238]:
# We can now predict the accuracy with validation Set

y_pred_knn_validation = knn_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_knn_validation)

0.855

The KNN Classifier model performs well over the validation set with 85% accuracy

##### ADA Boost Classifier

In [239]:
ada_boost = AdaBoostClassifier(random_state = 42)

n_estimators = [50, 100, 400, 700, 1000]
learning_rate = [0.001, 0.01, 0.05, 0.09]
algorithm = ['SAMME', 'SAMME.R']

param_grid_ada = {'n_estimators' : n_estimators, 'learning_rate' : learning_rate, 'algorithm' : algorithm}

rand_search_ada = RandomizedSearchCV(ada_boost, param_grid_ada, cv = 4, scoring='roc_auc', refit = True, n_jobs = -1, verbose = 2)

rand_search_ada.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   57.7s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100, 400, 700, 1000], 'learning_rate': [0.001, 0.01, 0.05, 0.09], 'algorithm': ['SAMME', 'SAMME.R']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [240]:
ada_best_params_ = rand_search_ada.best_params_
ada_best_estimators_ = rand_search_ada.best_estimator_

print(ada_best_params_)
print(ada_best_estimators_)

{'n_estimators': 1000, 'learning_rate': 0.05, 'algorithm': 'SAMME.R'}
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=1000, random_state=42)


In [241]:
y_pred_ada_estimator = ada_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_ada_estimator)

0.8946216385240775

In [242]:
# We can now predict the accuracy with validation Set

y_pred_ada_validation = ada_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_ada_validation)

0.835

The ADA Classifier Model accuracy drops from 89.4% to 83.5%, showing clear signs of overfitting

##### Gradient Boosting Classifier

In [243]:
GB_Classifier = GradientBoostingClassifier(random_state = 42)

n_estimators = [50, 100, 400, 700, 1000]
learning_rate = [0.1, 0.5]
max_depth = [10, 20]
min_samples_split = [2, 4, 10, 12, 16]
min_samples_leaf = [1, 5, 10]
max_features = [5, 20]
max_leaf_nodes = [2, 10, 20]
                            
param_grid_grad_boost = {'n_estimators' : n_estimators, 'learning_rate' : learning_rate,
                              'max_depth' : max_depth, 'min_samples_split' : min_samples_split,
                              'min_samples_leaf' : min_samples_leaf, 'max_features' : max_features,
                              'max_leaf_nodes' : max_leaf_nodes}

rand_search_grad_boost = RandomizedSearchCV(GB_Classifier, param_grid_grad_boost, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_grad_boost.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   32.2s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100, 400, 700, 1000], 'learning_rate': [0.1, 0.5], 'max_depth': [10, 20], 'min_samples_split': [2, 4, 10, 12, 16], 'min_samples_leaf': [1, 5, 10], 'max_features': [5, 20], 'max_leaf_nodes': [2, 10, 20]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [244]:
gb_best_params_ = rand_search_grad_boost.best_params_
gb_best_estimators_ = rand_search_grad_boost.best_estimator_

print(gb_best_params_)
print(gb_best_estimators_)

{'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_leaf_nodes': 10, 'max_features': 20, 'max_depth': 10, 'learning_rate': 0.1}
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=20, max_leaf_nodes=10,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


In [245]:
y_pred_gb_estimator = gb_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_gb_estimator)

0.8921200750469043

In [246]:
# We can now predict the accuracy with validation Set

y_pred_gb_validation = gb_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_gb_validation)

0.83625

The GB Classifier Model accuracy drops from 89.2% to 83.6%, showing brief signs of overfitting

##### Extra Trees Classifier

In [247]:
extra_classifier = ExtraTreesClassifier(random_state = 42)

n_estimators = [50, 100, 400, 700, 1000]
max_features = [5, 7, 10]
max_depth = [10, 20]
min_samples_split = [2, 4, 10, 12, 16]
min_samples_leaf = [1, 5, 10]  # Mhm, this one leads to accuracy of test and train sets being the same.

param_grid_extra_trees = {'n_estimators' : n_estimators, 'max_features' : max_features,
                         'max_depth' : max_depth, 'min_samples_split' : min_samples_split,
                         'min_samples_leaf' : min_samples_leaf}


rand_search_extra_trees = RandomizedSearchCV(extra_classifier, param_grid_extra_trees, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_extra_trees.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   24.5s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100, 400, 700, 1000], 'max_features': [5, 7, 10], 'max_depth': [10, 20], 'min_samples_split': [2, 4, 10, 12, 16], 'min_samples_leaf': [1, 5, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [248]:
et_best_params_ = rand_search_extra_trees.best_params_
et_best_estimators_ = rand_search_extra_trees.best_estimator_

print(et_best_params_)
print(et_best_estimators_)

{'n_estimators': 700, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 5, 'max_depth': 10}
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features=5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [249]:
y_pred_et_estimator = et_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_et_estimator)

0.8721075672295184

In [250]:
# We can now predict the accuracy with validation Set

y_pred_et_validation = et_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_et_validation)

0.855

The Extra Trees Classifier model performs well over the validation set with 85% accuracy

##### Bagging Classifier

In [251]:
Bag_Classifier = BaggingClassifier(DecisionTreeClassifier(random_state=42))

n_estimators = [50,70,100,200,500]
max_samples = [10,50,100]

param_grid_bag_clf = {'n_estimators':n_estimators, 'max_samples':max_samples}

rand_search_bag_clf = RandomizedSearchCV(Bag_Classifier, param_grid_bag_clf, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_bag_clf.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   34.8s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 70, 100, 200, 500], 'max_samples': [10, 50, 100]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [252]:
bag_best_params_ = rand_search_bag_clf.best_params_
bag_best_estimators_ = rand_search_bag_clf.best_estimator_

print(bag_best_params_)
print(bag_best_estimators_)

{'n_estimators': 500, 'max_samples': 50}
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=50, n_estimators=500, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)


In [253]:
y_pred_bag_estimator = bag_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_bag_estimator)

0.868980612883052

In [254]:
# We can now predict the accuracy with validation Set

y_pred_bag_validation = bag_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_bag_validation)

0.855

The Bagging Classifier model performs well over the validation set with 85% accuracy

##### Deep Neural Networks

In [255]:
# Create Architecture for Neural Networks.

def reset_graph (seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# Initialize the Input Layers and Hidden Layers
n_inputs = 100
n_hidden1 = 10
n_hidden2 = 50
n_hidden3 = 70
n_outputs = 2

reset_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")

In [256]:
# Implement dropout

training = tf.placeholder_with_default(False, shape=(), name='training')

dropout_rate = 0.5 
X_drop = tf.layers.dropout(X, dropout_rate, training=training) 

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, name="hidden1", 
                              activation=tf.nn.relu)             
    hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2",
                              activation=tf.nn.relu)
    hidden3 = tf.layers.dense(hidden2, n_hidden3, name="hidden3",
                              activation=tf.nn.relu)

    logit = tf.layers.dense(hidden3, n_outputs, name="outputs")

In [257]:
with  tf.name_scope ("loss"):
    xentropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(tf.cast(y, tf.int32), depth = 2), logits=logit)
    loss = tf.reduce_mean(xentropy, name="loss")
    loss_summary = tf.summary.scalar('log_loss', loss)

In [258]:
learning_rate = 0.0001

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [259]:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logit, tf.cast(y, tf.int64), 1) # tf.cast is new. 
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    accuracy_summary = tf.summary.scalar('accuracy', accuracy)

In [260]:
init = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
saver = tf.train.Saver()

Now we need to define the directory to write the TensorBoard logs to:

In [261]:
from datetime import datetime

def log_dir(prefix=""):
    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    root_logdir = "tf_logs"
    if prefix:
        prefix += "-"
    name = prefix + "run-" + now
    return "{}/{}/".format(root_logdir, name)

In [262]:
logdir = log_dir("TD_NN")

Now we can create the FileWriter that we will use to write the TensorBoard logs:

In [263]:
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [264]:
m_1, n_1 = x_train_tdm.shape

In [265]:
def random_batch(X_train, y_train, batch_size):
    rnd_indices = np.random.randint(0, len(X_train), batch_size)
    X_batch = X_train[rnd_indices]
    y_batch = y_train[rnd_indices]
    return X_batch, y_batch

In [266]:
#Run the first model on the training set.

n_epochs = 200
batch_size = 25
n_batches = int(np.ceil(m_1 / batch_size))
best_loss = np.infty
epochs_without_progress = 0
max_epochs_without_progress = 50

checkpoint_path = "TD_NN/tmp/train_dnn_reg_model.ckpt"
checkpoint_epoch_path = checkpoint_path + ".epoch"
final_model_path = "TD_NN/train_dnn_reg_model"

with tf.Session() as sess:
    if os.path.isfile(checkpoint_epoch_path):
        # if the checkpoint file exists, restore the model and load the epoch number
        with open(checkpoint_epoch_path, "rb") as f:
            start_epoch = int(f.read())
        print("Training was interrupted. Continuing at epoch", start_epoch)
        saver.restore(sess, checkpoint_path)
    else:
        start_epoch = 0
        sess.run(init)
        sess.run(init_l)
        for epoch in range(start_epoch, n_epochs):
            for iteration in range(batch_size):
                X_batch, y_batch = random_batch(x_train_tdm, np.array(train_set_y), batch_size)
                sess.run(training_op, feed_dict={X: X_batch, y: y_batch})          
            
            accuracy_val, loss_val, accuracy_summary_str, loss_summary_str = sess.run([accuracy, loss,accuracy_summary, loss_summary], 
                                                              feed_dict={X: x_train_tdm, y: np.array(train_set_y)})
            
            file_writer.add_summary(accuracy_summary_str, epoch)
            file_writer.add_summary(loss_summary_str, epoch)
            if epoch % 5 == 0:
                print("Epoch:", epoch,
                      "\tTraining accuracy: {:.3f}%".format(accuracy_val * 100),
                      "\tLoss: {:.5f}".format(loss_val))
            saver.save(sess, checkpoint_path)
            with open(checkpoint_epoch_path, "wb") as f:
                f.write(b"%d" % (epoch + 1))
            if loss_val < best_loss:
                saver.save(sess, final_model_path)
                best_loss = loss_val
                os.remove(checkpoint_epoch_path)
            else:
                epochs_without_progress += 5
                if epochs_without_progress > max_epochs_without_progress:
                    print("Early stopping")
                    break

Epoch: 0 	Training accuracy: 87.523% 	Loss: 0.69177
Epoch: 5 	Training accuracy: 87.367% 	Loss: 0.68419
Epoch: 10 	Training accuracy: 87.086% 	Loss: 0.67690
Epoch: 15 	Training accuracy: 87.117% 	Loss: 0.66978
Epoch: 20 	Training accuracy: 87.054% 	Loss: 0.66286
Epoch: 25 	Training accuracy: 86.961% 	Loss: 0.65617
Epoch: 30 	Training accuracy: 86.992% 	Loss: 0.64948
Epoch: 35 	Training accuracy: 86.961% 	Loss: 0.64321
Epoch: 40 	Training accuracy: 86.929% 	Loss: 0.63686
Epoch: 45 	Training accuracy: 86.929% 	Loss: 0.63081
Epoch: 50 	Training accuracy: 86.929% 	Loss: 0.62488
Epoch: 55 	Training accuracy: 86.929% 	Loss: 0.61904
Epoch: 60 	Training accuracy: 86.929% 	Loss: 0.61357
Epoch: 65 	Training accuracy: 86.929% 	Loss: 0.60815
Epoch: 70 	Training accuracy: 86.898% 	Loss: 0.60282
Epoch: 75 	Training accuracy: 86.898% 	Loss: 0.59763
Epoch: 80 	Training accuracy: 86.898% 	Loss: 0.59254
Epoch: 85 	Training accuracy: 86.898% 	Loss: 0.58742
Epoch: 90 	Training accuracy: 86.898% 	Loss: 0.5

In [267]:
#Get the Training accuracy

with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    accuracy_value = accuracy.eval(feed_dict={X: x_train_tdm, y: train_set_y})

print (accuracy_value)

INFO:tensorflow:Restoring parameters from TD_NN/train_dnn_reg_model
0.8689806


In [268]:
#Get the validation accuracy

with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    accuracy_value = accuracy.eval(feed_dict={X: x_val_tdm, y: validation_set_y})

print (accuracy_value)

INFO:tensorflow:Restoring parameters from TD_NN/train_dnn_reg_model
0.855


##### Voting Classifier - Ensemble the Models, which have performed well on the validation models. These models are build on Random Forest, K-Nearest Neighbors, Extra Trees and Bagging Classification algorithms

In [269]:
total_estimators= [ ('rfo_clf', forest_best_estimators_), 
                    ('knn_clf', knn_best_estimators_),
                    ('etc_clf', et_best_estimators_),
                    ('bag_clf', bag_best_estimators_)
                  ]


voting_clf  = VotingClassifier(total_estimators)

voting_clf.fit(x_train_tdm, train_set_y)

VotingClassifier(estimators=[('rfo_clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=5, max_leaf_nodes=2,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_...stimators=500, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [270]:
#Predict the y_pred to get accuracy score.
y_pred_voting = voting_clf.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_voting)

0.868980612883052

In [271]:
# We can now predict the accuracy with validation Set

y_pred_voting_validation = voting_clf.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_voting_validation)

0.855

Voting Classifier doesn't impact much on the accuracy.

###### So I consider Deep Neural Networks as my best model with 85.5% accuracy. The Evaluation Criteria for selecting the best model is ROC_AUC.

In [272]:
with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    dnn_test_output = sess.run(tf.argmax(logit,1),feed_dict={X: x_test_tdm})

INFO:tensorflow:Restoring parameters from TD_NN/train_dnn_reg_model


In [273]:
result_test  = pd.DataFrame()
result_test["Labels"] = dnn_test_output

In [274]:
result_test["Comment_no"] = pd.DataFrame(list(range(4000,5572,1)))

In [275]:
result_test.to_csv("output_labels.csv")