In [1036]:
#Libraries for NLTK
from nltk.corpus import stopwords, gazetteers
from nltk import sent_tokenize, wordpunct_tokenize
from nltk.stem.porter import PorterStemmer

#Libraries for Text Processing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Libraries for Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

#Models
from sklearn.ensemble import RandomForestClassifier     # Random Forest
from sklearn.naive_bayes import MultinomialNB           # Nave Bayes
from sklearn.linear_model import LogisticRegression     # logistic regression
from sklearn.tree import DecisionTreeClassifier         
from sklearn.ensemble import AdaBoostClassifier         # Ada Boost Classifier
from sklearn.ensemble import GradientBoostingClassifier # Gradient Boosting
from sklearn.ensemble import ExtraTreesClassifier       # Extra Trees Classifier
from sklearn.ensemble import BaggingClassifier          # Bagging Classifier
from sklearn.ensemble import VotingClassifier           # Ensemble Model
import tensorflow as TF                                 # Deep Neural Networks

# Other Libraries
import csv
import re
import string
import numpy as np
import pandas as pd
import os
import time
import warnings
import tensorflow as tf
warnings.filterwarnings('ignore')

# to make this notebook's output stable across runs
np.random.seed(123)


from sklearn.model_selection import train_test_split #training and testing data split
from scipy.stats import reciprocal, uniform

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [1037]:
FILE_ROOT = "C:\\Users\\Owner\\Desktop\\TD Text Analytics\\"
TD_SOURCE_FILE = FILE_ROOT + "train.txt"
LABEL_FILE = FILE_ROOT + "labels_Candidate.csv"

In [1038]:
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [1039]:
#Run this module only once - NLTK Stopwords 

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('gazetteers')

##### Preprocessing data - removing non ascii characters, Conver the words to Lowercase

In [1158]:
def strip_non_ascii(string):
    ''' Returns the string without non ASCII characters'''
    stripped = (c for c in string if 0 < ord(c) < 127)
    return ''.join(stripped)

def pre_process(text):
            
        #Remove non-ascii characters
        text = strip_non_ascii(text)
                
        #Convert to lowercase
        text = text.lower()

        #Convert words with repeated characters
        text = re.sub(r'([a-z])\1{2,}',r'\1',text)
        
        return text

In [1243]:
def tokenize(text):
    tokens = [word for sent in sent_tokenize(text) for word in wordpunct_tokenize(sent)]
         
    #Remove stopwords
    stop = stopwords.words("english")
    tokens = [token for token in tokens if token not in stop]

    #Remove all words less than three chars
    tokens = [token for token in tokens if len(token) >= 3]
        
    #Apply stemming
    stemmer = PorterStemmer()
    tokens  = [stemmer.stem(token) for token in tokens]

    return tokens

##### Import the Data file

In [1244]:
DataFile = pd.read_csv(TD_SOURCE_FILE, sep = None)

DataFile.columns = ["ID", "Wd","Text"]

First, I will merge all the words for every comment, to form complete sentences.

In [1245]:
NewDataFile = DataFile.groupby('ID').apply(lambda x: x['Text'].values.tolist())

Now, I will replace my lists to Strings for text analysis. This will also eliminate the brakets "[]" and the comma separator from the entire text

In [1246]:
FormatDataFile = pd.DataFrame(NewDataFile.apply(lambda x:' '.join(x)))
FormatDataFile.columns = ["Text"]

##### Partition the data to identify the initial training and test set.

In [1247]:
#Now split Back The data to training and test set - before applying the pipeline
train_set, test_set = train_test_split(FormatDataFile, 
                                       test_size=0.28205,shuffle=False)

In [1248]:
train_set.shape

(3998, 1)

In [1249]:
test_set.shape

(1572, 1)

##### We can now add the labels as a new column to the training set.


In [1250]:
Labels = pd.read_csv(LABEL_FILE)

In [1251]:
# Extract the training labels
Extract_Train_Set = Labels[0:3999]
Extract_Train_Set.columns = ["ID", "Label"]
Extract_Train_Set = Extract_Train_Set.drop("ID",axis=1)

# Get the first column for test set
Extract_Text_Label = Labels[4000:]
Extract_Text_Label.columns = ["ID", "Label"]
Extract_Text_Label = Extract_Text_Label.drop("Label",axis=1)

Extract_Train_Set.shape

(3999, 1)

In [1252]:
train_set['Label'] = Extract_Train_Set['Label']

In [1253]:
train_set.shape

(3998, 2)

With Training and Test Sets already defined and finalized, We also need a validation set to tune the model. Since we dont have test labels, Validation set approach will help improving over the accuracy.

In [1254]:
#Now split Back The training data to training and validation set - before applying the pipeline
training_set, validation_set = train_test_split(train_set, test_size=0.20)

In [1288]:
# Now define x and y.

#the Y Variable
train_set_y = training_set["Label"].copy()
validation_set_y = validation_set["Label"].copy()

#the X variables
train_set_X = training_set.drop("Label", axis=1)
validation_set_X = validation_set.drop("Label", axis=1)
test_set_X = test_set 

##### Build the TDM using the train set

In [1289]:
cv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', preprocessor=pre_process, tokenizer=tokenize)

In [1290]:
cv.fit(train_set_X["Text"])
x_train_count = cv.transform(train_set_X["Text"])
x_validation_count = cv.transform(validation_set_X["Text"])
x_test_count = cv.transform(test_set_X["Text"])

In [1291]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',preprocessor=pre_process,tokenizer=tokenize)

tfidf_vect.fit(train_set_X["Text"])
xtrain_tfidf =  tfidf_vect.transform(train_set_X["Text"])
xvalid_tfidf =  tfidf_vect.transform(validation_set_X["Text"])
xtest_tfidf =   tfidf_vect.transform(test_set_X["Text"])

In [1292]:
x_train_tdm = x_train_count.toarray()
x_validation_tdm = x_validation_count.toarray()
x_test_tdm = x_test_count.toarray()

In [1293]:
x_train_tdm.shape

(3198, 6166)

##### The Term Document Matrix has 29544 features. The next step is to reduce these dimensions and extract the best 1000 features using chi2 test statistics. 

In [1294]:
ch2 = SelectKBest(chi2, k=1000)
x_train_tdm = ch2.fit_transform(x_train_tdm, train_set_y)

In [1295]:
x_val_tdm = ch2.transform(x_validation_tdm)
x_test_tdm = ch2.transform(x_test_tdm)

##### Build the first model - Random Forest Classifier

In [1300]:
forest_class = RandomForestClassifier(random_state = 42)

n_estimators = [5, 7, 10]
min_samples_split = [0.3,0.5,2]
min_samples_leaf = [0.1,0.5,1]

param_grid_forest = {'n_estimators' : n_estimators, 
                     'min_samples_split' : min_samples_split,
                     'min_samples_leaf': min_samples_leaf }


rand_search_forest = RandomizedSearchCV(forest_class, param_grid_forest, cv = 4, scoring='roc_auc', refit = True,
                                 n_jobs = -1, verbose=2)

rand_search_forest.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    8.5s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [5, 7, 10], 'min_samples_split': [0.3, 0.5, 2], 'min_samples_leaf': [0.1, 0.5, 1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [1301]:
# Now we will try searching the best estimator and predict the values on the training set

forest_best_params_ = rand_search_forest.best_params_
forest_best_estimators_ = rand_search_forest.best_estimator_

print(forest_best_params_)
print(forest_best_estimators_)

{'n_estimators': 5, 'min_samples_split': 0.3, 'min_samples_leaf': 1}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=0.3,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


In [1302]:
# Get the training accuracy with Random Forest

y_pred_forest = forest_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_forest)

0.9143214509068167

In [1303]:
# We can now predict the accuracy with validation Set

y_pred_forest_validation = forest_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_forest_validation)

0.87625

In [None]:
y_pred_forest_validation

As we notice, Random Forest fits well on the validation set with 87.6% accuracy. The model generalizes well. but before making a final conclusion that lets build more models

##### Nave Bayes Classifier

In [1304]:
N_Bayes_ = MultinomialNB()

alpha = [0.0001, 0.001, 1]

param_grid_Bayes = {'alpha':alpha}

grid_search_bayes = GridSearchCV(N_Bayes_, param_grid_Bayes, cv = 4, scoring='roc_auc', refit = True,
                                 n_jobs = -1, verbose=2)

grid_search_bayes.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 3 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.0s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': [0.0001, 0.001, 1]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=2)

In [1305]:
bayes_best_params_ = grid_search_bayes.best_params_
bayes_best_estimators_ = grid_search_bayes.best_estimator_

print(bayes_best_params_)
print(bayes_best_estimators_)

{'alpha': 1}
MultinomialNB(alpha=1, class_prior=None, fit_prior=True)


In [1306]:
# Get the training accuracy with Naive Bayes

y_pred_bayes = bayes_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_bayes)

0.9036898061288305

In [1307]:
# We can now predict the accuracy with validation Set

y_pred_bayes_validation = bayes_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_bayes_validation)

0.8875

The results shows that Naive Bayes model fits well with a Validation accuracy rate of 88.75%

##### Logistic Regression

In [1308]:
log_reg = LogisticRegression(random_state = 42)
log_reg.fit(x_train_tdm, train_set_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [1310]:
log_reg = LogisticRegression(random_state = 42)

C = [0.0001, 0.001, 1]
                            
param_grid_log_reg = {'C' : C}

rand_search_log_reg = GridSearchCV(log_reg, param_grid_log_reg, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_log_reg.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 3 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.6s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.0001, 0.001, 1]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=2)

In [1311]:
logreg_best_params_ = rand_search_log_reg.best_params_
logreg_best_estimators_ = rand_search_log_reg.best_estimator_

print(logreg_best_params_)
print(logreg_best_estimators_)

{'C': 1}
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [1312]:
# Get the training accuracy with Logistic Regression

y_pred_logreg = logreg_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_logreg)

0.8899312070043778

In [1313]:
# We can now predict the accuracy with validation Set

y_pred_logreg_validation = logreg_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_logreg_validation)

0.88875

The Logistic Regression model performs well on the validation set with 88.8% accuracy on the validation set

##### ADA Boost Classifier

In [1404]:
ada_boost = AdaBoostClassifier(random_state = 42)

n_estimators = [5, 50]
learning_rate = [0.001, 0.01, 1]
algorithm = ['SAMME', 'SAMME.R']

param_grid_ada = {'n_estimators' : n_estimators, 'learning_rate' : learning_rate, 'algorithm' : algorithm}

rand_search_ada = RandomizedSearchCV(ada_boost, param_grid_ada, cv = 4, scoring='roc_auc', refit = True, n_jobs = -1, verbose = 2)

rand_search_ada.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   31.8s finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [5, 50], 'learning_rate': [0.001, 0.01, 1], 'algorithm': ['SAMME', 'SAMME.R']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [1405]:
ada_best_params_ = rand_search_ada.best_params_
ada_best_estimators_ = rand_search_ada.best_estimator_

print(ada_best_params_)
print(ada_best_estimators_)

{'n_estimators': 50, 'learning_rate': 1, 'algorithm': 'SAMME.R'}
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=42)


In [1406]:
y_pred_ada_estimator = ada_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_ada_estimator)

0.8655409631019387

In [1407]:
# We can now predict the accuracy with validation Set

y_pred_ada_validation = ada_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_ada_validation)

0.88625

The ADA Classifier Model performs better with validation set as the accuracy improves to 88.6%, showing signs of underfitting

##### Gradient Boosting Classifier

In [1410]:
GB_Classifier = GradientBoostingClassifier(random_state = 42)

n_estimators = [50, 100]
learning_rate = [0.1, 0.5]
max_depth = [3,5]
min_samples_split = [2, 4]
min_samples_leaf = [1, 5]
                            
param_grid_grad_boost = {'n_estimators' : n_estimators, 'learning_rate' : learning_rate,
                              'max_depth' : max_depth, 'min_samples_split' : min_samples_split,
                              'min_samples_leaf' : min_samples_leaf}

rand_search_grad_boost = RandomizedSearchCV(GB_Classifier, param_grid_grad_boost, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_grad_boost.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  4.1min finished


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100], 'learning_rate': [0.1, 0.5], 'max_depth': [3, 5], 'min_samples_split': [2, 4], 'min_samples_leaf': [1, 5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [1411]:
gb_best_params_ = rand_search_grad_boost.best_params_
gb_best_estimators_ = rand_search_grad_boost.best_estimator_

print(gb_best_params_)
print(gb_best_estimators_)

{'n_estimators': 100, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_depth': 5, 'learning_rate': 0.5}
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


In [1412]:
y_pred_gb_estimator = gb_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_gb_estimator)

0.8883677298311444

In [1413]:
# We can now predict the accuracy with validation Set

y_pred_gb_validation = gb_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_gb_validation)

0.86625

The GB Classifier Model performs well on the validation set with 86.6% accuracy

##### Extra Trees Classifier

In [1432]:
extra_classifier = ExtraTreesClassifier(random_state = 42)

n_estimators = [2,10]
min_samples_split = [2, 4]
min_samples_leaf = [1, 2]  # Mhm, this one leads to accuracy of test and train sets being the same.

param_grid_extra_trees = {'n_estimators' : n_estimators,
                         'min_samples_split' : min_samples_split,
                         'min_samples_leaf' : min_samples_leaf}


rand_search_extra_trees = GridSearchCV(extra_classifier, param_grid_extra_trees, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_extra_trees.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:   22.5s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [2, 10], 'min_samples_split': [2, 4], 'min_samples_leaf': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [1433]:
et_best_params_ = rand_search_extra_trees.best_params_
et_best_estimators_ = rand_search_extra_trees.best_estimator_

print(et_best_params_)
print(et_best_estimators_)

{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 10}
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [1434]:
y_pred_et_estimator = et_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_et_estimator)

0.8764853033145716

In [1435]:
# We can now predict the accuracy with validation Set

y_pred_et_validation = et_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_et_validation)

0.8825

The Extra Trees Classifier model performs well over the validation set with 88.25% accuracy

##### Bagging Classifier

In [1451]:
Bag_Classifier = BaggingClassifier(DecisionTreeClassifier(random_state=42))
Bag_Classifier.fit(x_train_tdm, train_set_y)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [1452]:
y_pred_bag_estimator = Bag_Classifier.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_bag_estimator)

0.9480925578486554

In [1453]:
y_pred_bag_estimator = Bag_Classifier.predict(x_val_tdm)
accuracy_score(validation_set_y, y_pred_bag_estimator)

0.86125

In [1458]:
Bag_Classifier = BaggingClassifier(DecisionTreeClassifier(random_state=42))

n_estimators = [10, 12]
max_samples = [.4, 1]

param_grid_bag_clf = {'n_estimators':n_estimators, 'max_samples':max_samples}

rand_search_bag_clf = GridSearchCV(Bag_Classifier, param_grid_bag_clf, cv = 4, scoring='roc_auc', 
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_bag_clf.fit(x_train_tdm, train_set_y)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:   17.6s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 12], 'max_samples': [0.4, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [1459]:
bag_best_params_ = rand_search_bag_clf.best_params_
bag_best_estimators_ = rand_search_bag_clf.best_estimator_

print(bag_best_params_)
print(bag_best_estimators_)

{'max_samples': 0.4, 'n_estimators': 10}
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.4, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)


In [1460]:
y_pred_bag_estimator = bag_best_estimators_.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_bag_estimator)

0.8918073796122576

In [1461]:
# We can now predict the accuracy with validation Set

y_pred_bag_validation = bag_best_estimators_.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_bag_validation)

0.88125

The Bagging Classifier model performs well over the validation set with 88.12% accuracy

##### Deep Neural Networks

In [1615]:
# Create Architecture for Neural Networks.

def reset_graph (seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# Initialize the Input Layers and Hidden Layers
n_inputs = 1000
n_hidden1 = 10
n_hidden2 = 150
n_hidden3 = 70
n_outputs = 2

reset_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")

In [1616]:
# Implement dropout

training = tf.placeholder_with_default(False, shape=(), name='training')

dropout_rate = 0.5 
X_drop = tf.layers.dropout(X, dropout_rate, training=training) 

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, name="hidden1", 
                              activation=tf.nn.relu)             
    hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2",
                              activation=tf.nn.relu)
    hidden3 = tf.layers.dense(hidden2, n_hidden3, name="hidden3",
                              activation=tf.nn.relu)

    logit = tf.layers.dense(hidden3, n_outputs, name="outputs")

In [1617]:
with  tf.name_scope ("loss"):
    xentropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(tf.cast(y, tf.int32), depth = 2), logits=logit)
    loss = tf.reduce_mean(xentropy, name="loss")
    loss_summary = tf.summary.scalar('log_loss', loss)

In [1618]:
learning_rate = 0.001

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [1619]:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logit, tf.cast(y, tf.int64), 1) # tf.cast is new. 
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    accuracy_summary = tf.summary.scalar('accuracy', accuracy)

In [1620]:
init = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
saver = tf.train.Saver()

Now we need to define the directory to write the TensorBoard logs to:

In [1621]:
from datetime import datetime

def log_dir(prefix=""):
    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    root_logdir = "tf_logs"
    if prefix:
        prefix += "-"
    name = prefix + "run-" + now
    return "{}/{}/".format(root_logdir, name)

In [1622]:
logdir = log_dir("TD_NN")

Now we can create the FileWriter that we will use to write the TensorBoard logs:

In [1623]:
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [1624]:
m_1, n_1 = x_train_tdm.shape

In [1625]:
def random_batch(X_train, y_train, batch_size):
    rnd_indices = np.random.randint(0, len(X_train), batch_size)
    X_batch = X_train[rnd_indices]
    y_batch = y_train[rnd_indices]
    return X_batch, y_batch

In [1626]:
#Run the first model on the training set.

n_epochs = 200
batch_size = 15
n_batches = int(np.ceil(m_1 / batch_size))
best_loss = np.infty
epochs_without_progress = 0
max_epochs_without_progress = 50

checkpoint_path = "TD_NN/tmp/train_dnn_reg_model.ckpt"
checkpoint_epoch_path = checkpoint_path + ".epoch"
final_model_path = "TD_NN/train_dnn_reg_model"

with tf.Session() as sess:
    if os.path.isfile(checkpoint_epoch_path):
        # if the checkpoint file exists, restore the model and load the epoch number
        with open(checkpoint_epoch_path, "rb") as f:
            start_epoch = int(f.read())
        print("Training was interrupted. Continuing at epoch", start_epoch)
        saver.restore(sess, checkpoint_path)
    else:
        start_epoch = 0
        sess.run(init)
        sess.run(init_l)
        for epoch in range(start_epoch, n_epochs):
            for iteration in range(batch_size):
                X_batch, y_batch = random_batch(x_train_tdm, np.array(train_set_y), batch_size)
                sess.run(training_op, feed_dict={X: X_batch, y: y_batch})          
            
            accuracy_val, loss_val, accuracy_summary_str, loss_summary_str = sess.run([accuracy, loss,accuracy_summary, loss_summary], 
                                                              feed_dict={X: x_train_tdm, y: np.array(train_set_y)})
            
            file_writer.add_summary(accuracy_summary_str, epoch)
            file_writer.add_summary(loss_summary_str, epoch)
            if epoch % 5 == 0:
                print("Epoch:", epoch,
                      "\tTraining accuracy: {:.3f}%".format(accuracy_val * 100),
                      "\tLoss: {:.5f}".format(loss_val))
            saver.save(sess, checkpoint_path)
            with open(checkpoint_epoch_path, "wb") as f:
                f.write(b"%d" % (epoch + 1))
            if loss_val < best_loss:
                saver.save(sess, final_model_path)
                best_loss = loss_val
                os.remove(checkpoint_epoch_path)
            else:
                epochs_without_progress += 5
                if epochs_without_progress > max_epochs_without_progress:
                    print("Early stopping")
                    break

Epoch: 0 	Training accuracy: 85.366% 	Loss: 0.68542
Epoch: 5 	Training accuracy: 85.741% 	Loss: 0.64976
Epoch: 10 	Training accuracy: 85.741% 	Loss: 0.61382
Epoch: 15 	Training accuracy: 85.741% 	Loss: 0.58230
Epoch: 20 	Training accuracy: 85.741% 	Loss: 0.55491
Epoch: 25 	Training accuracy: 85.741% 	Loss: 0.53315
Epoch: 30 	Training accuracy: 85.741% 	Loss: 0.51316
Epoch: 35 	Training accuracy: 85.741% 	Loss: 0.49775
Epoch: 40 	Training accuracy: 85.741% 	Loss: 0.48358
Epoch: 45 	Training accuracy: 85.741% 	Loss: 0.47177
Epoch: 50 	Training accuracy: 85.741% 	Loss: 0.46088
Epoch: 55 	Training accuracy: 85.741% 	Loss: 0.45285
Epoch: 60 	Training accuracy: 85.741% 	Loss: 0.44597
Epoch: 65 	Training accuracy: 85.741% 	Loss: 0.44039
Epoch: 70 	Training accuracy: 85.741% 	Loss: 0.43593
Epoch: 75 	Training accuracy: 85.741% 	Loss: 0.43195
Epoch: 80 	Training accuracy: 85.741% 	Loss: 0.42917
Epoch: 85 	Training accuracy: 85.741% 	Loss: 0.42562
Epoch: 90 	Training accuracy: 85.741% 	Loss: 0.4

In [1627]:
#Get the Training accuracy

with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    accuracy_value = accuracy.eval(feed_dict={X: x_train_tdm, y: train_set_y})

print (accuracy_value)

INFO:tensorflow:Restoring parameters from TD_NN/train_dnn_reg_model
0.8574109


In [1628]:
#Get the validation accuracy

with tf.Session() as sess:
    saver.restore(sess, final_model_path)
    accuracy_value = accuracy.eval(feed_dict={X: x_val_tdm, y: validation_set_y})

print (accuracy_value)

INFO:tensorflow:Restoring parameters from TD_NN/train_dnn_reg_model
0.90125


Thus Deep Nueral Network model is not well suited for this assignment.

##### Voting Classifier - Ensemble the Models, that performed well on the validation models. These models are build on Random Forest, K-Nearest Neighbors, Extra Trees and Bagging Classification algorithms

In [1568]:
total_estimators= [ ('nby_clf', bayes_best_estimators_ ),
                    ('log_clf', logreg_best_estimators_),
                    ('gbo_clf', gb_best_estimators_)
                  ]


voting_clf  = VotingClassifier(total_estimators)

voting_clf.fit(x_train_tdm, train_set_y)

VotingClassifier(estimators=[('nby_clf', MultinomialNB(alpha=1, class_prior=None, fit_prior=True)), ('log_clf', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='libline...        presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [1629]:
#Predict the y_pred to get accuracy score.
y_pred_voting = voting_clf.predict(x_train_tdm)
accuracy_score(train_set_y, y_pred_voting)

0.892432770481551

In [1630]:
# We can now predict the accuracy with validation Set

y_pred_voting_validation = voting_clf.predict(x_val_tdm)
accuracy_score(validation_set_y,y_pred_voting_validation)

0.88875

In [1631]:
pd.DataFrame(y_pred_voting_validation).to_csv("dnn.csv")

Voting Classifier doesn't impact much on the accuracy.

###### So I consider the Logistic Regression model as my best model with 88.87% accuracy. The Evaluation Criteria for selecting the best model is ROC_AUC.

In [1632]:
# We can now predict the accuracy with test Set

y_pred_logreg_test = logreg_best_estimators_.predict(x_test_tdm)

In [1633]:
result_test  = pd.DataFrame()
result_test["Labels"] = y_pred_logreg_test

In [1634]:
result_test["Comment_no"] = pd.DataFrame(list(range(4000,5572,1)))

In [1635]:
result_test.to_csv("Labels_logistic_regression.csv")