# Classification
Now that our dataset is ready, we shall use ensemble classifier algorithms to train the model.

Since there are various different models, we shall try all of them one by one.

Let us load the dataset and split it for training and testing.


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the prepared dataset; the first CSV column is used as the row index.
dataset = pd.read_csv('../new_type/t_dataset.csv', index_col=0)

# Predictor columns fed to every model, and the target column (DiPS).
features = ['Initial Steps', 'Average Steps', 'Last Week Steps', 'Week Number']
x = dataset[features]
y = dataset.DiPS

# Hold out 10% of the rows for testing; the fixed seed keeps the shuffled
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=50,
)

# Last expression of the cell: the notebook renders the loaded frame.
dataset

## Bagging Algorithms

### 1. Bagged Decision Trees

In [None]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Imported in this cell so it runs top-to-bottom on its own: these names were
# previously only brought into scope by cells further down the notebook.
from sklearn.metrics import roc_auc_score, plot_roc_curve
import matplotlib.pyplot as plt

# Initialize the model and train it on the training split.
# 10 folds, matching every other model in this notebook so the summary
# compares like with like. (The earlier value of 70 produced very small
# folds; a fold holding a single class has no defined ROC AUC.)
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
b_model = BaggingClassifier(base_estimator=cart, n_estimators=100, random_state=1)
b_model.fit(x_train, y_train)

# Mean cross-validated ROC AUC on the training split.
b_results = model_selection.cross_val_score(
    b_model, x_train, y_train, cv=kfold, scoring='roc_auc'
)
# Hard class predictions for the held-out rows.
b_scores = b_model.predict(x_test)
# Probability of the second class in b_model.classes_ (the positive class of
# a binary target); this is what the AUC in the legend is computed from.
y_pred_proba = b_model.predict_proba(x_test)[:, 1]

print(b_results.mean())

# Plot the ROC curve on the held-out rows, with the chance diagonal.
plot_roc_curve(
    b_model,
    x_test,
    y_test,
    color='darkorange',
    linewidth=3,
    label='EnsembleClassifier (auc=%.2f)' % roc_auc_score(y_test, y_pred_proba),
)
plt.xlabel('1 - Specificity', size=13)
plt.ylabel('Sensitivity', size=13)
plt.title('Receiver Operating Characteristic', size=15)
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.show()

### 2. Random Forest

In [None]:
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix,plot_roc_curve
# Initialize the model. max_features=4 lets every split consider all four
# feature columns selected for this notebook.
kfold = model_selection.KFold(n_splits=10)
r_model = RandomForestClassifier(n_estimators=100, max_features=4)

# Mean 10-fold cross-validated ROC AUC on the training split.
# cross_val_score fits its own clones of r_model, so the model does not need
# to be fitted beforehand.
r_results = model_selection.cross_val_score(
    r_model, x_train, y_train, cv=kfold, scoring='roc_auc'
)
print(r_results.mean())

# Fit once on the whole training split, then evaluate on the held-out rows.
r_model.fit(x_train, y_train)
# Column 1 of predict_proba: probability of the second class in classes_.
r_scores = r_model.predict_proba(x_test)[:, 1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, r_scores)
print('roc_auc_score for RandomForest: ', roc_auc_score(y_test, r_scores))
plot_roc_curve(r_model, x_test, y_test)

### 3. Extra Trees

In [None]:
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix,plot_roc_curve,accuracy_score

# Initialize the model. max_features=4 lets every split consider all four
# feature columns selected for this notebook.
kfold = model_selection.KFold(n_splits=10)
e_model = ExtraTreesClassifier(n_estimators=100, max_features=4)

# Mean 10-fold cross-validated ROC AUC on the training split.
e_results = model_selection.cross_val_score(
    e_model, x_train, y_train, cv=kfold, scoring='roc_auc'
)
print(e_results.mean())

# Fit on the whole training split, then evaluate on the held-out rows.
e_model.fit(x_train, y_train)
# Column 1 of predict_proba: probability of the second class in classes_.
e_scores = e_model.predict_proba(x_test)[:, 1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, e_scores)
print('roc_auc_score for ExtraTrees: ', roc_auc_score(y_test, e_scores))
plot_roc_curve(e_model, x_test, y_test)

## Boosting Algorithms

### 1. AdaBoost

In [None]:
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix,plot_roc_curve,accuracy_score


# Initialize the model; the fixed seed keeps the boosted ensemble reproducible.
kfold = model_selection.KFold(n_splits=10)
ab_model = AdaBoostClassifier(n_estimators=100, random_state=0)

# Mean 10-fold cross-validated ROC AUC on the training split.
ab_results = model_selection.cross_val_score(
    ab_model, x_train, y_train, cv=kfold, scoring='roc_auc'
)
print(ab_results.mean())

# Fit on the whole training split, then evaluate on the held-out rows.
ab_model.fit(x_train, y_train)
# Column 1 of predict_proba: probability of the second class in classes_.
ab_scores = ab_model.predict_proba(x_test)[:, 1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, ab_scores)
print('roc_auc_score for AdaBoost: ', roc_auc_score(y_test, ab_scores))
plot_roc_curve(ab_model, x_test, y_test)

### 2. Stochastic Gradient Boosting

In [None]:
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier


# Initialize the model; the fixed seed keeps the boosted ensemble reproducible.
# NOTE(review): subsample is left at its default, so no row subsampling is
# requested here -- confirm whether the "stochastic" variant (subsample < 1.0)
# was intended.
kfold = model_selection.KFold(n_splits=10)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=5)

# Mean 10-fold cross-validated ROC AUC on the training split.
gb_results = model_selection.cross_val_score(
    gb_model, x_train, y_train, cv=kfold, scoring='roc_auc'
)
print(gb_results.mean())

# Plot ROC Curve
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix,plot_roc_curve
import matplotlib.pyplot as plt

# Fit on the whole training split, then evaluate on the held-out rows.
gb_model.fit(x_train, y_train)
# Column 1 of predict_proba: probability of the second class in classes_.
gb_scores = gb_model.predict_proba(x_test)[:, 1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, gb_scores)
print('roc_auc_score for GradientBoosting: ', roc_auc_score(y_test, gb_scores))
plot_roc_curve(gb_model, x_test, y_test)
# Render the figure (a bare plt.plot() draws nothing and shows nothing).
plt.show()

## Voting Ensemble

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier


kfold = model_selection.KFold(n_splits=10)

# Create the sub models.
# SVC only exposes predict_proba when built with probability=True, and soft
# voting needs class probabilities from every sub model.
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = SVC(probability=True)
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# Create the ensemble model.
# voting='soft' averages the predicted probabilities, which gives the
# ensemble the continuous scores that the 'roc_auc' scorer requires; a
# hard-voting ensemble only yields class labels.
ensemble = VotingClassifier(estimators, voting='soft')

# Mean 10-fold cross-validated ROC AUC on the same training split used for
# the other models (the previous X / Y names were not defined at this point).
ve_results = model_selection.cross_val_score(
    ensemble, x_train, y_train, cv=kfold, scoring='roc_auc'
)
print(ve_results.mean())

Now that we have tried several different ensemble classifiers, let us summarize the mean cross-validated ROC AUC of each model.

## Summary

Here we compare the mean cross-validated ROC AUC of the various ensemble models.

In [None]:
# Every *_results array was produced by cross_val_score(..., scoring='roc_auc'),
# so the figures printed here are mean ROC AUC values, not accuracies.
print("Mean cross-validated ROC AUC of various models:")
print("\nBagging Algorithms")
print("1. Bagged Decision Trees: ", b_results.mean())
print("2. Random Forest: ", r_results.mean())
print("3. Extra Trees: ", e_results.mean())
print('\nBoosting Algorithms')
print("1. AdaBoost: ", ab_results.mean())
print("2. Stochastic Gradient Boosting ", gb_results.mean())
print('\nVoting Ensemble:', ve_results.mean())

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math

fig = plt.figure(figsize=(10, 6))

# Dedicated names for the plot data: reusing x / y here would overwrite the
# feature frame and target column that the notebook defines under those names.
model_names = [
    'BaggedDecisionTree',
    'RandomForest',
    'ExtraTrees',
    'AdaBoost',
    'StochasticGradient',
    'VotingEnsemble',
]
# Mean cross-validated ROC AUC of each model, as a percentage.
model_scores = [
    results.mean() * 100
    for results in (b_results, r_results, e_results, ab_results, gb_results, ve_results)
]

# Zoom the y-axis onto the range the scores actually occupy so that small
# differences between models stay visible.
low = min(model_scores) - 2
high = max(model_scores)
plt.ylim([math.ceil(low - 0.5 * (high - low)), math.ceil(high + 0.5 * (high - low))])

# creating the bar plot
plt.bar(
    model_names,
    model_scores,
    color=['red', 'blue', 'green', 'orange', 'cyan', 'maroon'],
    width=0.5,
)

# The plotted values come from scoring='roc_auc', so label them as such.
plt.xlabel("Ensemble Classifier Models")
plt.ylabel("Mean ROC AUC %")
plt.title("Cross-validated ROC AUC of different ensemble models")
plt.show()

## Logistic Regression

Let us find out the accuracy of logistic regression (LR) on the same dataset.

In [None]:
#LR
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fit a logistic regression classifier with default settings on the
# training split.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

# Class predictions for the held-out rows.
y_pred = logreg.predict(x_test)

# Mean accuracy on the held-out rows; computed once and reused for the report.
log_scores = logreg.score(x_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_scores))

## Support Vector Machine

Let us check how the SVM classifier performs.

In [None]:
from sklearn import svm

# Reload the data exactly as at the top of the notebook (first CSV column as
# the row index) so this cell can be run on its own.
dataset = pd.read_csv('../new_type/t_dataset.csv', index_col=0)
y = dataset.DiPS
features = ['Initial Steps', 'Average Steps', 'Last Week Steps', 'Week Number']
X = dataset[features]

# Same test_size / random_state / shuffle as the notebook's first split, so the
# SVM is scored on the same held-out rows as the other models and this cell
# does not swap a different split in for cells that run afterwards.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=50, shuffle=True
)

# Fit a support vector classifier with default settings.
model = svm.SVC()
model.fit(x_train, y_train)

# Class predictions and mean accuracy on the held-out rows.
y_pred = model.predict(x_test)
sv_score = model.score(x_test, y_test)
print('Accuracy of support vector machine on test set: {:.2f}'.format(sv_score))

## Comparison

Let us plot the different accuracies using matplotlib.

In [None]:
import math
# Dedicated names for the plot data: reusing x / y here would overwrite the
# feature and target variables that the notebook defines under those names.
labels = ['SVM', 'LR', 'EM']
# NOTE(review): the SVM and LR values are test-set accuracies, while the EM
# value is the gradient boosting model's mean cross-validated ROC AUC --
# confirm that mixing the two metrics on one axis is intended.
scores = [sv_score * 100, log_scores * 100, gb_results.mean() * 100]

fig, ax = plt.subplots()

# Zoom the x-axis onto the range the scores actually occupy.
low = min(scores) - 2
high = max(scores)
plt.xlim([math.ceil(low - 0.5 * (high - low)), math.ceil(high + 0.2 * (high - low))])

width = 0.75  # the width of the bars
ind = np.arange(len(scores))  # the positions of the bars along the y-axis
ax.barh(ind, scores, width, color=["blue", 'green', 'maroon'])
ax.set_yticks(ind)
# Keyword names are case-sensitive: it must be `fontsize`, not `Fontsize`.
ax.set_yticklabels(labels, minor=False, fontsize=12)

# Write each value just past the end of its bar, truncated to five characters.
for i, v in enumerate(scores):
    ax.text(v + 1, i, str(v)[:5], color='blue', fontweight='bold')

plt.xlabel('Accuracy')
# Call show(); the bare attribute reference `plt.show` never rendered the figure.
plt.show()