# Advanced Classification

In this notebook, we will cover:

* Decision Tree Classifier
* Random Forest Classifier
* AdaBoost
* Hyperparameter Tuning by GridSearchCV
* Hyperparameter Tuning by Randomized Search (RandomizedSearchCV)
* XGBoost

Importing all necessary packages

In [3]:
#!pip install pydotplus

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

### Task 1: Load Data 'titanic_clean.csv'

In [None]:
# write code here
data = None
df = None
df.head()

### Task 2: Do One Hot encoding for categorical variables and store in df

In [None]:
# write code here
df_OneHot = None
df_OneHot.head()

In [None]:
# Write code here to copy into df
df = None

### Task 3: Create dependent and independent variables

In [None]:
# write code here
X = None
Y = None

### Task 4: Split the data into training and testing set and set the random state to 100

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Write code here

print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

## Defining Decision Tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Gini-criterion decision tree limited to depth 3; random_state is pinned to 100.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=100)

In [None]:
clf_dt.fit(xtrain, ytrain)

In [None]:
# Hard class predictions for the test split.
dt_pred = clf_dt.predict(xtest)
# Keep only the second probability column — presumably P(Survived = 1);
# confirm the column order against clf_dt.classes_.
dt_pred_prb = clf_dt.predict_proba(xtest)[:, 1]

### Task 5: Find accuracy for Decision Tree

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# write code here
accuracy_dt = None
print("Accuracy: {}".format(accuracy_dt))

### Task 6: Find precision for Decision Tree

In [None]:
from sklearn.metrics import precision_score

In [None]:
# write code here
precision_dt = None
print("Precision: {}".format(precision_dt))

### Task 7: Find Recall for Decision Tree

In [None]:
from sklearn.metrics import recall_score

In [None]:
# write code here
recall_dt = None
print("Recall: {}".format(recall_dt))

### Task 8: Find F1 Score for Decision Tree

In [None]:
from sklearn.metrics import f1_score

In [None]:
# write code here
dt_f1 = None
print("F1 Score: {}".format(dt_f1))

### Task 9: Print Classification Report for Decision Tree

In [None]:
from sklearn.metrics import classification_report

In [None]:
# write code here

### ROC Curve
Importing function to plot ROC AUC Curve

In [None]:
from plot_roc_curve import plot_roc_curve

In [None]:
from sklearn.metrics import auc, roc_curve, roc_auc_score

In [None]:
# Switch seaborn to its 'poster' plotting context before drawing.
sns.set_context('poster')
# Area under the ROC curve, computed from the predicted probabilities (not the hard labels).
auc_dt = roc_auc_score(ytest, dt_pred_prb)
# Points of the ROC curve; `threshold` is returned by roc_curve but not used below.
fpr, tpr, threshold = roc_curve(ytest, dt_pred_prb)
# plot_roc_curve is the helper imported from the local plot_roc_curve module.
plot_roc_curve(fpr, tpr, label='AUC = %0.3f' % auc_dt)

## Plotting Decision Tree

Loading packages

In [None]:
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

### For Windows

Download Graphviz 2.38 from this [link](https://www.softpedia.com/get/Others/Miscellaneous/Graphviz.shtml) and install it. Then run the line below, adjusted to your installation path. (This is a one-time step.)

In [None]:
# Uncomment both lines and adjust the directory to your Graphviz install.
# The path must be a raw string (r'...'): in a normal string literal the "\b"
# in "\bin" is read as a backspace escape and the resulting path is wrong.
#import os
#os.environ["Path"] += os.pathsep + r'C:\Program Files (x86)\Graphviz2.38\bin'

### For Ubuntu

In [None]:
# ! sudo apt install graphviz

### For MAC

In [None]:
# ! brew install graphviz

In [None]:
dot_data = StringIO()

In [None]:
# Write the fitted tree into the in-memory buffer, labelling features with
# the column names of X.
export_graphviz(
    clf_dt,
    out_file=dot_data,
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)

In [None]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

In [None]:
Image(graph.create_png())

Exporting PNG file of Decision Trees

In [None]:
graph.write_png('Decision Tree_Titanic.png')

## Features Importance

In [None]:
clf_dt.feature_importances_

In [None]:
# Pair every column name of X with the importance the tree assigned to it.
features_tuple = [
    (column, importance)
    for column, importance in zip(X.columns, clf_dt.feature_importances_)
]

In [None]:
features_tuple

In [None]:
# Two-column frame (name, importance) built from the (name, importance) pairs.
importance_columns = ["Feature Names", "Importance"]
feature_imp = pd.DataFrame(features_tuple, columns=importance_columns)

In [None]:
feature_imp = feature_imp.sort_values("Importance", ascending=False)

In [None]:
# Bar chart of the decision tree's feature importances, one bar per row of feature_imp.
plt.figure(figsize=(12, 6))
sns.barplot(x="Feature Names", y="Importance", data=feature_imp, color='b')
plt.xlabel("Titanic Features")
plt.ylabel("Importance")
plt.xticks(rotation=90)  # draw the feature-name tick labels vertically
# The title names the model being plotted (a decision tree); it previously read
# "Decision Classifier", unlike the matching Random Forest chart.
plt.title("Decision Tree Classifier - Features Importance")

# Random Forest

### Defining Random Forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf_rf = RandomForestClassifier(random_state=100)

In [None]:
clf_rf.fit(xtrain, ytrain)

In [None]:
rf_pred = clf_rf.predict(xtest)
rf_pred_prb = clf_rf.predict_proba(xtest)[:, 1]

### Task 10: Find Precision for Random Forest

In [None]:
# write code here
precision_rf = None
print("Precision: {}".format(precision_rf))

### Task 11: Find Accuracy for Random Forest

In [None]:
# write code here
accuracy_rf = None
print("Accuracy: {}".format(accuracy_rf))

### Task 12: Find Recall for Random Forest

In [None]:
# write code here
recall_rf = None
print("Recall: {}".format(recall_rf))

### Task 13: Find F1 Score for Random Forest

In [None]:
# write code here
rf_f1 = None
print("F1 Score: {}".format(rf_f1))

### Task 14: Print Classification Report for Random Forest

In [None]:
from sklearn.metrics import classification_report

In [None]:
# write code here

### ROC Curve for Random Forest

In [None]:
from sklearn.metrics import auc, roc_curve, roc_auc_score

In [None]:
auc_rf = roc_auc_score(ytest, rf_pred_prb)
fpr, tpr, threshold = roc_curve(ytest, rf_pred_prb)
plot_roc_curve(fpr, tpr, label='AUC = %0.3f' % auc_rf)

### Features Importance

In [None]:
features_tuple = list(zip(X.columns, clf_rf.feature_importances_))

In [None]:
feature_imp = pd.DataFrame(features_tuple, columns=[
                           "Feature Names", "Importance"])

In [None]:
feature_imp = feature_imp.sort_values("Importance", ascending=False)

In [None]:
# Bar chart of the random forest's feature importances, one bar per row of feature_imp.
plt.figure(figsize=(12, 6))
sns.barplot(x="Feature Names", y="Importance", data=feature_imp, color='b')
plt.xlabel("Titanic Features")
plt.ylabel("Importance")
plt.xticks(rotation=90)  # draw the feature-name tick labels vertically
plt.title("Random Forest Classifier - Features Importance")

## Hyperparameter Tuning using GridSearchCV

Importing GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

Making grid of parameters and running GridSearch CV

In [None]:
# Coarse grid: 7 forest sizes x 7 depth limits x 6 minimum leaf sizes.
param_grid1 = {
    "n_estimators": list(range(9, 64, 9)),
    "max_depth": [1, 5, 10, 15, 20, 25, 30],
    "min_samples_leaf": [1, 2, 4, 6, 8, 10],
}

RF = RandomForestClassifier(random_state=100)

# Instantiate the GridSearchCV object (RF_cv1): 5-fold CV, ranked by accuracy, 4 jobs.
RF_cv1 = GridSearchCV(
    estimator=RF,
    param_grid=param_grid1,
    scoring='accuracy',
    cv=5,
    n_jobs=4,
)

# Run the search on the training split.
RF_cv1.fit(xtrain, ytrain)

# Uncomment to inspect the full per-candidate results:
# RF_cv1.cv_results_
RF_cv1.best_params_, RF_cv1.best_score_

So we found **'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 18** as optimum parameters

Let's try new set of parameter values near to optimum.

In [None]:
# Finer grid centred on the values reported by the first search.
param_grid2 = {
    "n_estimators": list(range(9, 28, 3)),
    "max_depth": list(range(5, 16)),
    "min_samples_leaf": [1, 2, 3, 4],
}

RF = RandomForestClassifier(random_state=100)

# Instantiate the GridSearchCV object (RF_cv2): 5-fold CV, ranked by accuracy, 4 jobs.
RF_cv2 = GridSearchCV(
    estimator=RF,
    param_grid=param_grid2,
    scoring='accuracy',
    cv=5,
    n_jobs=4,
)

# Run the search on the training split.
RF_cv2.fit(xtrain, ytrain)

# Uncomment to inspect the full per-candidate results
# (cv_results_ is the current attribute; grid_scores_ no longer exists):
# RF_cv2.cv_results_
RF_cv2.best_params_, RF_cv2.best_score_

 So we found **'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 15** as optimum parameters

### Tuned Random Forest

In [None]:
RF_grid = RF_cv2.best_estimator_

In [None]:
# NOTE(review): RF_cv2 was created without overriding refit, so best_estimator_
# is presumably already trained on xtrain/ytrain — this extra fit looks
# redundant (though harmless); confirm before removing.
RF_grid.fit(xtrain, ytrain)

In [None]:
rf_pred_t = RF_grid.predict(xtest)
rf_pred_prb_t = RF_grid.predict_proba(xtest)[:, 1]

### Task 15: Find Accuracy for Tuned Random Forest

In [None]:
# write code here
accuracy_rf_t = None
print("Accuracy after tuning: {}".format(accuracy_rf_t))

### Task 16: Find Recall for Tuned Random Forest

In [None]:
# write code here
recall_rf_t = None
recall_rf_t

### Task 17: Find Precision for Tuned Random Forest

In [None]:
# write code here
precision_rf_t = None
precision_rf_t

### Task 18: Find F1 Score for Tuned Random Forest

In [None]:
# write code here
rf_t_f1 = None
rf_t_f1

### Task 19: Print Classification Report for Tuned Random Forest

In [None]:
# write code here

### ROC Curve for Tuned Random Forest

In [None]:
auc_rf_t = roc_auc_score(ytest, rf_pred_prb_t)
fpr, tpr, threshold = roc_curve(ytest, rf_pred_prb_t)
plot_roc_curve(fpr, tpr, label='AUC Tuned = %0.3f' % auc_rf_t)

### Hyperparameters Tuning Using Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
rf_rs = RandomForestClassifier(random_state=100)

In [None]:
# Candidate forest sizes: 10 evenly spaced values from 10 to 1000.
n_estimators = list(map(int, np.linspace(start=10, stop=1000, num=10)))
# Candidate depth limits: 10, 20, ..., 110, plus None as a final option.
max_depth = list(map(int, np.linspace(10, 110, num=11))) + [None]
# Candidate minimum sample counts required to split a node.
min_samples_split = [2, 5, 10]

In [None]:
# Search space for the randomized search, keyed by RandomForestClassifier parameter name.
random_grid1 = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
)

In [None]:
# Randomized search: draw 50 parameter combinations from random_grid1 and score
# each with 5-fold cross-validation; random_state fixes which combinations are drawn.
# scoring and n_jobs are given explicitly to match the GridSearchCV cells in this
# notebook: candidates are ranked by accuracy, and 4 parallel workers keep the
# 250 fits (forests of up to 1000 trees) from running strictly one after another.
rf_random = RandomizedSearchCV(
    estimator=rf_rs,
    param_distributions=random_grid1,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=4,
    verbose=2,
    random_state=100,
)
# Fit the random search model
rf_random.fit(xtrain, ytrain)

**Selecting the best classifier**

In [None]:
clf_rf_rand = rf_random.best_estimator_

In [None]:
clf_rf_rand.fit(xtrain, ytrain)

In [None]:
pred_rf_rand = clf_rf_rand.predict(xtest)

In [None]:
prb_rf_rand = clf_rf_rand.predict_proba(xtest)[:, 1]

### Task 20: Find the Accuracy of Tuned Random Forest through Randomized Search

In [None]:
# write code here
accuracy_rf_r = None
accuracy_rf_r

### Task 21: Find the Precision of Tuned Random Forest through Randomized Search

In [None]:
# write code here
precision_rf_r = None
precision_rf_r

### Task 22: Find the Recall of Tuned Random Forest through Randomized Search

In [None]:
# write code here
recall_rf_r = None
recall_rf_r

### Task 23: Find the F1-Score of Tuned Random Forest through Randomized Search

In [None]:
# write code here
f1_rf_r = None
f1_rf_r

### Task 24: Print the Classification Report of Tuned Random Forest through Randomized Search

In [None]:
# write code here

### ROC Curve

In [None]:
auc_rf_r = roc_auc_score(ytest, prb_rf_rand)
fpr, tpr, threshold = roc_curve(ytest, prb_rf_rand)
plot_roc_curve(fpr, tpr, label='AUC Tuned Random = %0.3f' % auc_rf_r)

# Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
clf_adb = AdaBoostClassifier(random_state=100)
clf_adb.fit(xtrain, ytrain)

In [None]:
pred_clf_adb = clf_adb.predict(xtest)

In [None]:
adb_pred_prb = clf_adb.predict_proba(xtest)[:, 1]

### Task 25: Find Accuracy for Ada Boost

In [None]:
# write code here
accuracy_adb = None
accuracy_adb

### Task 26: Find Precision for Ada Boost

In [None]:
# write code here
precision_adb = None
precision_adb

### Task 27: Find Recall for Ada Boost

In [None]:
# write code here
recall_adb = None
recall_adb

### Task 28: Find F1 Score for Ada Boost

In [None]:
# write code here
f1_adb = None
f1_adb

### Task 29: Print Classification Report for Ada Boost

In [None]:
# write code here

### ROC Curve

In [None]:
auc_adb = roc_auc_score(ytest, adb_pred_prb)
fpr, tpr, threshold = roc_curve(ytest, adb_pred_prb)
plot_roc_curve(fpr, tpr, label='AUC = %0.3f' % auc_adb)

# XGBoost

## Extreme Gradient Boosting Algorithm

In [4]:
#!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-win_amd64.whl (89.1 MB)
     ---------------------------------------- 89.1/89.1 MB 9.1 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3



[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip


In [None]:
import xgboost as xgb

In [None]:
# Single-threaded XGBoost classifier with one explicit seed.
# The earlier call passed both seed=25 and random_state=100 — two names for the
# same setting with different values — plus the legacy nthread alias. Only the
# scikit-learn style names are kept (random_state / n_jobs), with the same seed
# value (100) used by the other models in this notebook.
clf_xgb = xgb.XGBClassifier(n_jobs=1, random_state=100)

In [None]:
clf_xgb.fit(xtrain, ytrain)

In [None]:
# Class labels for the test split, then the second probability column for ROC/AUC.
xgb_pred = clf_xgb.predict(xtest)
xgb_pred_prb = clf_xgb.predict_proba(xtest)[:, 1]

### Task 30: Find Accuracy of XGBoost

In [None]:
#write code here 
accuracy_xgb = None
print("Accuracy: {}".format(accuracy_xgb))

In [None]:
### Task 31: Find  Recall of XGBoost

In [None]:
#write code here 
recall_xgb = None
recall_xgb

### Task 32: Find Precision of XGBoost

In [None]:
#write code here 
precision_xgb = None
precision_xgb

### Task 33: Find F1 Score XGB

In [None]:
#write code here 
xgb_f1=None
xgb_f1

### Task 34: Print Classification Report of XGBoost

In [None]:
#write code here 


### ROC Curve

In [None]:
# AUC and ROC curve points from the XGBoost probabilities; plotted with the AUC in the label.
auc_xgb = roc_auc_score(ytest, xgb_pred_prb)
fpr, tpr, threshold = roc_curve(ytest, xgb_pred_prb)
plot_roc_curve(fpr, tpr, label='AUC = %0.3f' % auc_xgb)

# Comparison

In [None]:
# One list per metric; position i in every list belongs to the i-th entry of "Algorithm".
comparison_dict = {
    "Algorithm": [
        "Decision Tree",
        "Random Forest",
        "Tuned Random Forest(Grid)",
        "Tuned Random Forest(Random)",
        "XGBoost",
        "Ada Boost",
    ],
    "Accuracy": [accuracy_dt, accuracy_rf, accuracy_rf_t, accuracy_rf_r, accuracy_xgb, accuracy_adb],
    "Precision": [precision_dt, precision_rf, precision_rf_t, precision_rf_r, precision_xgb, precision_adb],
    "Recall": [recall_dt, recall_rf, recall_rf_t, recall_rf_r, recall_xgb, recall_adb],
    "AUC": [auc_dt, auc_rf, auc_rf_t, auc_rf_r, auc_xgb, auc_adb],
    "F1 Score": [dt_f1, rf_f1, rf_t_f1, f1_rf_r, xgb_f1, f1_adb],
}

In [None]:
# Tabulate the metrics, then show the models ordered by Recall, then Accuracy, then AUC
# (highest first). The sorted view is only displayed; `comparison` itself is not reordered.
comparison = pd.DataFrame(data=comparison_dict)
comparison.sort_values(by=['Recall', 'Accuracy', 'AUC'], ascending=False)

# Submission on Kaggle

### Task 35: Import test data

In [None]:
# Write code here
test = None
df_test = None
df_test.head()

### Task 36: Do One Hot encoding of test data

In [None]:
# write code here
df_OneHot = None
df_OneHot.head()
df_test = None

### Task 37: Separate Passenger ID for submission

In [None]:
# Write code here

### Task 38: Do prediction through final model

In [None]:
# write code here
pred_final = None

#### Creating Data Frame for submission

In [None]:
# Two-column submission frame. PassengerID is not defined anywhere in the provided
# cells — it has to be created in the "Separate Passenger ID" task cell before this runs.
submission = pd.DataFrame({'PassengerId': PassengerID, 'Survived': pred_final})

In [None]:
submission.head()

### Task 39: Export the dataset into csv file

In [None]:
# Write the submission frame to CSV in the working directory, without the index column.
submission.to_csv('my_submission v2.0.csv', index=False)