In [None]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [None]:
# Load the dataset from the CSV file "dataset.csv" (path is relative to the working directory).
data = pd.read_csv("dataset.csv")
# NOTE(review): the recode below is disabled, so Target keeps whatever values the CSV holds.
# Later cells refer to the labels "Dropout", "Enrolled" and "Graduate" as strings.
# Be aware that the disabled mapping sends both 'Graduate' and 'Enrolled' to 2.
# data.Target = data.Target.replace({'Dropout':0,'Graduate':2,"Enrolled":2})

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Show descriptive statistics for the columns of the frame.
data.describe()

In [None]:
# Print every column name on one line, each name followed by a comma.
print("".join(f"{column_name}," for column_name in data.columns), end="")

In [None]:
# Count how many rows fall into each distinct value of the Target column.
data["Target"].value_counts()

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Every column except the last one is used as a feature; the last column is
# expected to be Target (the label).
feature_cols = data.columns[:-1]
X = data[feature_cols]  # Features
y = data.Target  # Target variable, kept as a 1-D vector of class labels

# The previous version called label_binarize(y, classes=[0, 1, 2]) here. That
# name is never imported in this notebook (NameError on a fresh kernel), and the
# integer classes do not match the string labels ("Dropout", "Enrolled",
# "Graduate") that the confusion-matrix and classification-report cells pass as
# `labels`. Those cells also require a 1-D label vector, so y is left untouched
# and the class count is derived from the labels actually present.
n_classes = y.nunique()

In [None]:
# Split dataset into training set and test set.
# test_size=0.2 holds out 20% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 20% test

In [None]:
# Baseline model: a shallow decision tree (max depth 2) with the default split criterion.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree may differ from run to run when splits tie.
clf = DecisionTreeClassifier(max_depth=2, random_state=1)

# Train the decision tree classifier on the training split.
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset.
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Decision tree split on information gain (entropy), limited to depth 3.
# random_state is pinned so repeated runs of this cell give the same tree;
# scikit-learn shuffles candidate features at each split otherwise.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)

# Train the decision tree classifier on the training split.
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct on the test split?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))



## Optimizing Decision Tree Performance using ID3

In [None]:
# Entropy-based tree (depth 3) for the ID3 section.
# A fixed random_state keeps the fitted tree, and therefore the reported
# accuracy, identical across kernel restarts.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)

# Train the decision tree classifier on the training split.
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct on the test split?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

## Optimizing Decision Tree Performance using Gini Impurity

In [None]:
# Gini-impurity tree (depth 3). The clf and y_pred produced here are the ones the
# following evaluation cells (confusion matrix, report, tree export) operate on,
# so random_state is pinned to make those downstream results reproducible.
clf = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=1)

# Train the decision tree classifier on the training split.
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct on the test split?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Confusion matrix with rows/columns fixed to the order Dropout, Enrolled, Graduate.
cf_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=["Dropout", "Enrolled", "Graduate"],
)

In [None]:
import seaborn as sns

# Heatmap of the confusion matrix (rows: true class, columns: predicted class).
# - fmt="d" prints the integer counts in full; the default ".2g" format would
#   show counts of 100 or more in scientific notation (e.g. 3.2e+02).
# - Tick labels repeat the label order used when cf_matrix was built, so the
#   axes show class names instead of bare indices 0/1/2.
sns.heatmap(
    cf_matrix,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=["Dropout", "Enrolled", "Graduate"],
    yticklabels=["Dropout", "Enrolled", "Graduate"],
)

In [None]:
# Use a wide figure for the report heatmap (the setting persists for later seaborn plots).
sns.set(rc={"figure.figsize": (15, 8)})

# Turn the classification report into a table, drop its last row (the support
# counts in scikit-learn's report dict), and transpose so each heatmap row is
# one class or average.
sns.heatmap(
    pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).iloc[:-1].transpose(),
    annot=True,
    cmap="Blues",
)


In [None]:
# Text report of precision / recall / F1 for the three classes, in a fixed order.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        labels=["Dropout", "Enrolled", "Graduate"],
    )
)

In [None]:
from sklearn import tree
import graphviz

# Export the fitted tree as DOT source.
# - class_names must be the list of classes in the order the classifier uses
#   (clf.classes_). Passing the whole data.Target column, as before, labels the
#   nodes with the target values of the first few rows instead of the real
#   class names.
# - feature_names is passed as a plain list of column names rather than a
#   pandas Index.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=list(feature_cols),
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render("Result")  # render to disk using the file name "Result"
graph  # last expression: display the tree inline

In [None]:
from sklearn.model_selection import cross_val_score

# Number of cross-validation folds.
fold = 10

# Score the current classifier on the full dataset, one score per fold.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=fold)
print("Scores :", scores)

# Mean of the per-fold scores.
avg_scores = scores.sum() / len(scores)
print("Average Accuracy :", avg_scores)

In [None]:
import seaborn as sns

# Plot the cross-validation score of each fold (x: fold index, y: score).
# x and y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so
# the previous positional call raises TypeError there.
sns.lineplot(x=list(range(fold)), y=scores)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc 

In [None]:
from sklearn.metrics import roc_auc_score
def find_roc_auc_score(data):
    """Train an unpruned decision tree on ``data`` and return its multiclass ROC AUC.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column is ``Target``; every preceding column is used
        as a feature. ``Target`` may hold the labels "Graduate", "Enrolled" and
        "Dropout" (recoded to 1, 2, 3 internally) or values that are already
        numeric, which ``replace`` leaves unchanged.

    Returns
    -------
    float
        One-vs-rest ROC AUC, computed on a 20% hold-out split from the
        predicted class probabilities.

    Notes
    -----
    The frame passed in is not modified. The earlier version overwrote
    ``data.Target`` in place and took its features from the notebook-level
    ``X`` instead of from ``data``.
    """
    # Recode the labels on a new Series so the caller's frame keeps its values.
    y = data.Target.replace({"Graduate": 1, "Enrolled": 2, "Dropout": 3})
    # Features come from the frame that was passed in (all columns but the last).
    X = data[data.columns[:-1]]

    # Split dataset into training set and test set: 80% training and 20% test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Unpruned tree; random_state is pinned so the returned score is reproducible.
    clf = DecisionTreeClassifier(random_state=1)
    clf.fit(X_train, y_train)

    # Class probabilities for the test rows; columns follow clf.classes_.
    y_score = clf.predict_proba(X_test)
    # labels tells roc_auc_score which class each probability column refers to.
    return roc_auc_score(y_test, y_score, multi_class="ovr", labels=clf.classes_)
find_roc_auc_score(data)

In [None]:
import numpy as np

# Per-class counts derived from the confusion matrix (one entry per class).
TP = np.diag(cf_matrix)                 # diagonal: correctly predicted as the class
FP = cf_matrix.sum(axis=0) - TP         # column total minus diagonal
FN = cf_matrix.sum(axis=1) - TP         # row total minus diagonal
TN = cf_matrix.sum() - (FP + FN + TP)   # everything else
print(TP, TN, FP, FN, sep="\n")

# Switch to float before computing the ratios.
FP, FN, TP, TN = (counts.astype(float) for counts in (FP, FN, TP, TN))

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP / (TP + FN)
print("TPR :", TPR)

# Specificity or true negative rate
TNR = TN / (TN + FP)
print("TNR :", TNR)

# Fall out or false positive rate
FPR = FP / (FP + TN)
print("FPR :", FPR)

# False negative rate
FNR = FN / (TP + FN)
print("FNR :", FNR)

# Overall accuracy for each class
ACC = (TP + TN) / (TP + FP + FN + TN)
print("ACC:", ACC)


In [None]:
import matplotlib.pyplot as plt
# One-vs-rest ROC curves for the fitted classifier.
# The previous version plotted the per-class (FPR, TPR) pairs taken from the
# confusion matrix. Those are three single operating points, one per class, and
# joining them with a line does not form an ROC curve. A curve needs the
# predicted probabilities swept over thresholds, which roc_curve does below.
y_score = clf.predict_proba(X_test)  # columns follow clf.classes_

fig, ax = plt.subplots(1, figsize=(10,10))
ax.set_title('Receiver Operating Characteristic - DecisionTree')
for class_index, class_label in enumerate(clf.classes_):
    # Treat the current class as positive and every other class as negative.
    fpr, tpr, thresholds = roc_curve(y_test == class_label, y_score[:, class_index])
    ax.plot(fpr, tpr, marker='o', label=f"{class_label} vs rest (AUC = {auc(fpr, tpr):.2f})")
# Diagonal reference line: a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [None]:
# metrics.plot_roc_curve(clf, X_test, y_test) 