Load Environment

In [1]:
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
import random
from pandas import read_csv, DataFrame,concat,get_dummies
import numpy as np
import matplotlib.pyplot as plt


Load Data

In [2]:
# Read the Pokemon dataset from the working directory.
data = read_csv('Pokemon.csv')
# Target vector: the 'Legendary' column.
y = data.loc[:, 'Legendary']
# Feature candidates: every column except the first and the last one.
X = data.iloc[:, 1:-1]

In [3]:
# Number of samples in the target vector.
y.shape[0]

800

Preprocessing

In [4]:
# List the raw feature columns before any preprocessing.
print(list(X.columns))

['Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation']


In [5]:
# Column groups used for preprocessing. 'Name' appears in neither group,
# so it is not carried into the model features.
X_labels = ['Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation']
continuous_labels = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation']
discrete_labels = ['Type 1', 'Type 2']

# Select from `data` rather than `X`: a later cell rebinds `X` to the encoded
# feature matrix, so reading the raw columns from `X` would raise KeyError
# whenever this cell is re-run after that one.
X_discrete = data[discrete_labels]
X_continuous = data[continuous_labels]

# Scale Continuous Data
scaler = StandardScaler()
X_continuous = DataFrame(
    data=scaler.fit_transform(X_continuous),
    columns=continuous_labels,
    # Keep the source index so the later column-wise concat aligns row for row.
    index=data.index,
)

# Change Discrete Data to One Hot Code
X_discrete = get_dummies(X_discrete)

In [6]:
# Reassemble the feature matrix: one-hot encoded columns first,
# followed by the scaled numeric columns.
X = concat([X_discrete, X_continuous], axis='columns')

Configuration

In [7]:
# Number of folds handed to cross_val_score.
cv_fold = 10
# Seed passed as random_state when the final decision tree is refit.
random_seed = 42

Training Decision Tree

In [8]:
# Grid search over split criterion and maximum depth for a decision tree,
# scored by mean cross-validated ROC-AUC.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# Shannon information gain criterion, so those two curves are expected to coincide.
criterions = ['gini', 'entropy', 'log_loss']
max_depth_list = range(1, 30)

# One score list per criterion, ordered like max_depth_list.
result_dict = {criteria: [] for criteria in criterions}

# Best configuration seen so far (the "accuracy" name is historical; the value is ROC-AUC).
best_accuracy = 0
best_depth = 0
best_criteria = None

for max_depth in max_depth_list:
    for criteria in criterions:
        # Use the shared seed from the configuration cell instead of a literal.
        DT = DecisionTreeClassifier(max_depth=max_depth, criterion=criteria, random_state=random_seed)
        cv_score = np.mean(cross_val_score(DT, X, y, scoring='roc_auc', n_jobs=-1, cv=cv_fold))
        result_dict[criteria].append(cv_score)
        # Strict '>' keeps the first (shallowest) configuration on ties.
        if cv_score > best_accuracy:
            best_accuracy = cv_score
            best_depth = max_depth
            best_criteria = criteria

Plot Decision Tree

In [13]:
# ROC-AUC versus max depth, one line per split criterion, with the best point marked.
plt.title('AUC_ROC of Decision Tree: Range of max depth ['+str(min(max_depth_list))+', '+str(max(max_depth_list))+']')
color = ['red','skyblue','green']
# Pair each criterion with its line colour instead of indexing by a manual counter.
for criteria, line_color in zip(criterions, color):
    plt.plot(max_depth_list, result_dict[criteria], color=line_color, label=criteria)
show_max='ROC-AUC='+str(round(best_accuracy,3))+'\n depth='+str(best_depth)
# move the text down a little bit
plt.annotate(show_max,xytext=(best_depth+2,best_accuracy-0.01),xy=(best_depth,best_accuracy))
# Green dot on the best (depth, score) pair.
plt.plot(best_depth,best_accuracy,'go')
plt.legend()
plt.xlabel('Max Depth')
plt.ylabel('AUC_ROC')
plt.show()

1


Retrain model to get parameters

In [10]:
# Refit a single tree on the full data with the best settings from the search,
# then list the ten features with the highest importance.
DT_model = DecisionTreeClassifier(
    max_depth=best_depth,
    criterion=best_criteria,
    random_state=random_seed,
)
DT_model.fit(X, y)

# Map each feature column to its importance in the fitted tree.
importance = dict(zip(X.columns.tolist(), DT_model.feature_importances_))

# Descending by importance; the sort is stable, so ties keep column order.
sort_dict = sorted(importance.items(), key=lambda pair: -pair[1])
for feature_name, score in sort_dict[:10]:
    print(feature_name, score)

Total 0.9748525020726886
Sp. Atk 0.02514749792731145
Type 1_Bug 0.0
Type 1_Dark 0.0
Type 1_Dragon 0.0
Type 1_Electric 0.0
Type 1_Fairy 0.0
Type 1_Fighting 0.0
Type 1_Fire 0.0
Type 1_Flying 0.0


In [11]:
# Draw the fitted tree with readable labels: without feature_names the nodes
# show positional placeholders instead of column names.
plt.figure(figsize=(12, 8))  # extra room for the labelled nodes
plot_tree(
    DT_model,
    feature_names=X.columns.tolist(),
    # Class labels taken from the fitted model so they match its class order.
    class_names=[str(label) for label in DT_model.classes_],
    filled=True,
)
plt.show()

In [12]:
# Dump the low-level arrays of the fitted tree: the feature index tested at
# each node and the index of each node's right child.
fitted_tree = DT_model.tree_
print(fitted_tree.feature)
print(fitted_tree.children_right)

[36 -2 36 36 -2 -2 40 -2 -2]
[ 2 -1  6  5 -1 -1  8 -1 -1]


Train Random Forest

In [58]:
# Grid search over split criterion and number of trees for a random forest,
# scored by mean cross-validated ROC-AUC.
criterions = ['gini', 'entropy', 'log_loss']
n_estimators_list = range(20, 2000, 20)

# One score list per criterion, ordered like n_estimators_list.
result_dict_RF = {criteria: [] for criteria in criterions}

# Best configuration seen so far (the "accuracy" name is historical; the value is ROC-AUC).
best_accuracy_RF = 0
best_nestimator_RF = None
best_criteria_RF = None

for estimator in n_estimators_list:
    for criteria in criterions:
        # Use the shared seed from the configuration cell instead of a literal.
        RF = RandomForestClassifier(n_estimators=estimator, criterion=criteria, n_jobs=-1, random_state=random_seed)
        cv_score = np.mean(cross_val_score(RF, X, y, scoring='roc_auc', n_jobs=-1, cv=cv_fold))
        result_dict_RF[criteria].append(cv_score)
        # Strict '>' keeps the first (smallest) forest on ties.
        if cv_score > best_accuracy_RF:
            best_accuracy_RF = cv_score
            best_nestimator_RF = estimator
            best_criteria_RF = criteria



Plot Random Forest

In [67]:
# ROC-AUC versus number of trees, one line per split criterion, with the best point marked.
plt.title('AUC_ROC of Random Forest: Range of n_estimators ['+str(min(n_estimators_list))+', '+str(max(n_estimators_list))+']')
color = ['red','skyblue','green']
# Pair each criterion with its line colour instead of indexing by a manual counter.
for criteria, line_color in zip(criterions, color):
    plt.plot(n_estimators_list, result_dict_RF[criteria], color=line_color, label=criteria)
show_max='AUC-ROC='+str(round(best_accuracy_RF,3))+'\n n_estimators='+str(best_nestimator_RF)
# move the text down a little bit
plt.annotate(show_max,xytext=(best_nestimator_RF-600,best_accuracy_RF-0.0002),xy=(best_nestimator_RF,best_accuracy_RF))
# Green dot on the best (n_estimators, score) pair.
plt.plot(best_nestimator_RF,best_accuracy_RF,'go')
plt.legend()
# The x axis is the number of trees, not the tree depth.
plt.xlabel('n_estimators')
plt.ylabel('AUC-ROC')
plt.show()

: 

Fit New Model

In [None]:
# Refit the forest on the full data with the best settings from the search.
# oob_score=True is required: oob_decision_function_ is only populated when
# out-of-bag scoring is enabled, otherwise the last line raises AttributeError.
RF_new = RandomForestClassifier(
    n_estimators=best_nestimator_RF,
    criterion=best_criteria_RF,
    oob_score=True,
    n_jobs=-1,
    random_state=random_seed,
)
RF_new.fit(X, y)
# Per-sample class probabilities estimated from out-of-bag trees.
RF_new.oob_decision_function_