# In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Scoring for classifiers
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

# Model selection and preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder

# Classifiers from scikit-learn
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#%%
# Exploratory look at the raw attrition table.
# NOTE: the bare expressions below only display a value when run
# interactively (line by line / in a notebook); as a script they are no-ops.
attrition = pd.read_csv('attrition.csv')
attrition.shape  # expected (1470, 35)
attrition.isnull().sum()  # check for missing values; found none
attrition.describe()
attrition.info()
# Correlate numeric columns only: the table also carries label columns
# (e.g. the 'Yes'/'No' target), which corr() cannot convert to float.
attrition.select_dtypes(include='number').corr()
# DataFrame.hist sizes the tick labels via xlabelsize / ylabelsize; unknown
# keywords such as xlabels / ylabels are forwarded to matplotlib and rejected.
attrition.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

#%% obtain data X and target y.
# Reload the raw table so this cell can be run on its own.
attrition = pd.read_csv('attrition.csv')

# Column 1 holds the target label; column 0 plus columns 2..34 are features.
y_0 = attrition.iloc[:, 1]
X_left = attrition.iloc[:, 0]
X_right = attrition.iloc[:, 2:35]
X_1 = pd.concat([X_left, X_right], axis=1)

# Binary target: 1 where the label is 'Yes', 0 otherwise.
y = np.where(y_0 == 'Yes', 1, 0)

# Categorical columns that get expanded into indicator (dummy) columns.
categorical_cols = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
]

# pandas.get_dummies is used for the one-hot step because it works directly
# on DataFrame columns (sklearn's OneHotEncoder was considered but not wired in).
dummy_cols = pd.get_dummies(X_1[categorical_cols])
X_2 = pd.concat([X_1, dummy_cols], axis=1)

# Drop the original categorical columns now that their dummies are present.
X = X_2.drop(columns=categorical_cols)

X.shape  # 1470*55

# Need to use one-hot encoding and normalization (put all data onto the same bell curve).

#%% feature selection: only select the feature that's useful.

#see zhihu post on feature engineering.


#%% decision tree, use default parameters, get a baseline.

# Baseline: an entropy-criterion decision tree with otherwise default settings.
DTg = DecisionTreeClassifier(criterion='entropy')

# Score all three metrics in a single 10-fold pass. The tree has no fixed
# random_state, so separate cross-validation runs would each fit different
# trees; one pass keeps the three metrics comparable and does a third of the
# fitting work.
dtg_scores = cross_validate(
    DTg, X, y,
    scoring=['f1_weighted', 'accuracy', 'roc_auc'],
    cv=10,
)

# Per-fold scores and their means, under the same names as before.
f1_cv = dtg_scores['test_f1_weighted']
f1_avg = f1_cv.mean()
print('f1', f1_avg)

acc_cv = dtg_scores['test_accuracy']
acc_avg = acc_cv.mean()
print('accuracy', acc_avg)

auc_cv = dtg_scores['test_roc_auc']
auc_avg = auc_cv.mean()
print('AUC', auc_avg)

#%% tune parameter of Decision tree: max leaf nodes

#need to add more parameters.
#add cartesian grid search to look for best parameter.

#stoch....gradient descent model to boost the data.(like adaboost)

# Candidate values for max_leaf_nodes, and one entropy tree per candidate.
leaf_li = [3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100, 200, 500]
DTig_li = [
    DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=n_leaves)
    for n_leaves in leaf_li
]

# Evaluate each tree with 10-fold cross-validation on weighted F1 and keep
# the mean over the folds as that candidate's score.
f1_dtig_li = []
for leaf, DTig in zip(leaf_li, DTig_li):
    f1_cv_dtig = cross_val_score(DTig, X, y, cv=10, scoring='f1_weighted')
    f1_dtig = sum(f1_cv_dtig) / len(f1_cv_dtig)
    f1_dtig_li.append(f1_dtig)
    print('k', leaf, "f1", f1_dtig)

# Find the optimal parameter (max leaf nodes) from the training results.
def func(x):
    """Return the item at index 1 of ``x``.

    Used as a sort key for (parameter, score) pairs so they order by score.
    """
    score = x[1]
    return score

# Rank the (max_leaf_nodes, mean F1) pairs from highest to lowest score and
# take the parameter value of the top-ranked pair.
f1_DTig_sort = sorted(zip(leaf_li, f1_dtig_li), key=func, reverse=True)
best_k_ig, _best_f1_ig = f1_DTig_sort[0]
# Recorded result: best k seems to be 7, with f1 score 0.8259.
print('best k, info gain:', best_k_ig)

#%% tune parameter: max depth

# Candidate depths 1..32 as integers (max_depth is an integer parameter).
depth_li = np.arange(1, 33)
# One entropy tree per candidate depth, so every depth in depth_li is scored.
DTdep_li = [
    DecisionTreeClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=7)
    for depth in depth_li
]
# Recorded observations: without max_leaf_nodes the best depth was 2
# (f1 = 0.824); with max_leaf_nodes = 7 the best depth was 4 (f1 = 0.8259).


# Evaluate each tree with 10-fold cross-validation on weighted F1 and keep
# the mean over the folds as that depth's score.
f1_dep_li = []
for depth, DTdep in zip(depth_li, DTdep_li):
    f1_cv_dep = cross_val_score(DTdep, X, y, scoring='f1_weighted', cv=10)
    f1_dep_avg = sum(f1_cv_dep) / len(f1_cv_dep)
    f1_dep_li.append(f1_dep_avg)
    print('max depth', depth, "f1", f1_dep_avg)

# Find the optimal parameter (max depth) from the training results.
def func(x):
    """Return the item at index 1 of ``x``.

    Used as a sort key for (parameter, score) pairs so they order by score.
    """
    second = x[1]
    return second

# Rank the (max_depth, mean F1) pairs from highest to lowest score and take
# the depth of the top-ranked pair.
f1_dep_sort = sorted(zip(depth_li, f1_dep_li), key=func, reverse=True)
best_dep_ig, _best_f1_dep = f1_dep_sort[0]
print('best depth, info gain:', best_dep_ig)
# Recorded result: with max_leaf_nodes = 7, the best depth is 4, f1 = 0.8259.

# Fit the tuned tree (depth 4, at most 7 leaves) on the full data set and
# show how much each feature column contributes to its splits.
DTdep47 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    max_leaf_nodes=7,
).fit(X, y)

print(DTdep47.feature_importances_)

#%% random forest, basically all default parameters.
# Baseline random forest: 100 trees, fixed seed, everything else default.
woods = RandomForestClassifier(n_estimators=100, random_state=0)
# Fit on the full data so `woods` itself is a usable fitted model afterwards.
woods.fit(X, y)
# Recorded results:
#   100 trees: f1 = 0.815, accuracy = 0.856, AUC = 0.79
#   200 trees: f1 = 0.810, accuracy = 0.855, AUC = 0.80
#   500 trees: f1 = 0.815, accuracy = 0.857, AUC = 0.81

# Score all three metrics in one 5-fold pass instead of refitting the same
# seeded forest on the same folds once per metric.
wood_scores = cross_validate(
    woods, X, y,
    scoring=['f1_weighted', 'accuracy', 'roc_auc'],
    cv=5,
)

# Per-fold scores and their means, under the same names as before.
woodf1_cv = wood_scores['test_f1_weighted']
woodf1_avg = woodf1_cv.mean()
print('f1', woodf1_avg)

wood_acc_cv = wood_scores['test_accuracy']
wood_acc_avg = wood_acc_cv.mean()
print('accuracy', wood_acc_avg)

wood_auc_cv = wood_scores['test_roc_auc']
wood_auc_avg = wood_auc_cv.mean()
print('AUC', wood_auc_avg)

#%% random forest, random grid search parameter tuning


from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyper-parameter.

# Number of trees in the forest: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split.
max_features = [9, 10, 11, 12, 13, 14, 15]

# Maximum number of levels in a tree: 10 values between 10 and 30,
# truncated to integers.
max_depth = [int(x) for x in np.linspace(10, 30, num=10)]

# Maximum number of leaf nodes per tree.
max_leaf_nodes = [100, 200, 500, 1000, 2000, 3000]

# Minimum number of samples required to split a node.
min_samples_split = [2, 3, 5, 10, 20]

# Minimum number of samples required at each leaf node.
min_samples_leaf = [2]

# Method of selecting samples for training each tree.
bootstrap = [False]

# Create the random grid.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'max_leaf_nodes': max_leaf_nodes,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Size of the full grid, derived from the grid itself so that every
# parameter in it (including bootstrap) is counted.
num_combo = int(np.prod([len(values) for values in random_grid.values()]))

print('tot combo:', num_combo)

#pp.pprint(random_grid)

# Use the random grid to search for good parameters.
#
# The full grid (its size is printed as 'tot combo' above) is too large to
# try exhaustively, so 100 combinations are sampled at random first to
# narrow down the search scope.

# Base model whose hyper-parameters are tuned.
rf = RandomForestClassifier()

# Randomized search: n_iter sets how many combinations are sampled, and each
# sampled combination is scored with 3-fold cross-validation.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=1,
    random_state=0,
    n_jobs=-1,
)

# Run the search.
rf_random.fit(X, y)

# Best parameter combination among the sampled ones, and its score.
print(rf_random.best_params_)
print(rf_random.best_score_)

#%% results from above, random state = 42
#first run: 
#{'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 
#'max_leaf_nodes': 70, 'max_features': 'sqrt', 'max_depth': 14, 'bootstrap': False}
#score = 0.8578231292517007
 

#second run
#'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 
#'max_leaf_nodes': 100, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': False 
#score = 0.861904761904762

#3rd run
#'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 
#'max_leaf_nodes': 150, 'max_features': 'auto', 'max_depth': 12, 'bootstrap': False
#score = 0.863265306122449

#4th run
#{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 
 #'max_leaf_nodes': 150, 'max_features': 'auto', 'max_depth': 13, 'bootstrap': True}
# score = 0.8585034013605443 with bootstrap true.


#5th run
#{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 
# 'max_leaf_nodes': 200, 'max_features': 'auto', 'max_depth': 11, 'bootstrap': False}
# score = 0.8612244897959184
 
#6th run : didn't restrict max_features, got lower score 
#{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 
#'max_leaf_nodes': None,'max_features': None, 'max_depth': 10, 'bootstrap': False}
#0.8204081632653061
 
#7th run
#'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 
#'max_leaf_nodes': 200, 'max_features': 'auto', 'max_depth': 14, 'bootstrap': False}
#score = 0.8646258503401361 
 
#8th run
#{'n_estimators': 350, 'min_samples_split': 3, 'min_samples_leaf': 1, 
# 'max_leaf_nodes': 500, 'max_features': 12, 'max_depth': 15, 'bootstrap': False}
# 0.8673469387755102 

#9th run 
#{'n_estimators': 250, 'min_samples_split': 3, 'min_samples_leaf': 1, 
#'max_leaf_nodes': 100, 'max_features': 12, 'max_depth': 10, 'bootstrap': False}
# 0.8666666666666667 
 
 #10th run
#{'n_estimators': 550, 'min_samples_split': 2, 'min_samples_leaf': 2, 
#'max_leaf_nodes': None, 'max_features': 15, 'max_depth': 11, 'bootstrap': False}
#0.8659863945578231

# !! good 11th run
#{'n_estimators': 550, 'min_samples_split': 5, 'min_samples_leaf': 2, 
#'max_leaf_nodes': 150, 'max_features': 12, 'max_depth': 10, 'bootstrap': False}
#0.8680272108843538

#set max leaf nodes to None, everythingelse same.
#{'n_estimators': 550, 'min_samples_split': 5, 'min_samples_leaf': 2, 
# 'max_leaf_nodes': None, 'max_features': 12, 'max_depth': 10, 'bootstrap': False}
#0.8625850340136054 
 
#12th run 
#{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 
#'max_leaf_nodes': None, 'max_features': 12, 'max_depth': 17, 'bootstrap': False}
# 0.8659863945578231 
 
# !!!!! good 13th run 
#'n_estimators': 350, 'min_samples_split': 5, 'min_samples_leaf': 2,
# 'max_leaf_nodes': 250, 'max_features': 10, 'max_depth': 16, 'bootstrap': False}
#0.8687074829931973
 
#14th expand range of some parameter
#{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 2, 
#'max_leaf_nodes': 2000, 'max_features': 13, 'max_depth': 15, 'bootstrap': False}
#0.8673469387755102 

#15th
#'n_estimators': 600, 'min_samples_split': 3, 'min_samples_leaf': 2, 
#'max_leaf_nodes': 500, 'max_features': 11, 'max_depth': 16, 'bootstrap': False}
#0.8673469387755102
#%% narrow down the range of parameters using the results above, and do an exhaustive grid search.
 
from sklearn.model_selection import GridSearchCV
 
# Narrowed grid around the best random-search results; every combination in
# it is evaluated.
narrow_grid = {'n_estimators': [1000],
               'max_features': [13],
               'max_depth': [12, 13],
               'max_leaf_nodes': [2000],
               'min_samples_split': [2],
               'min_samples_leaf': [2],
               'bootstrap': [False]}

# Exhaustive search over narrow_grid with 3-fold cross-validation, reusing
# the `rf` base model created for the randomized search.
rf_narrow = GridSearchCV(estimator=rf, param_grid=narrow_grid,
                         cv=3, n_jobs=-1, verbose=1)

rf_narrow.fit(X, y)

# Report the winning combination together with its score, matching what the
# randomized-search cell prints and the scores logged in the notes below.
print(rf_narrow.best_params_)
print(rf_narrow.best_score_)
#1st
#'bootstrap': False, 'max_depth': 13, 'max_features': 13, 'max_leaf_nodes': 2000, 
#'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000
#0.868027

#2nd
#{'bootstrap': False, 'max_depth': 12, 'max_features': 13, 'max_leaf_nodes': 2000, 
#'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1400}
#0.8666666666666667

#3rd
#'bootstrap': False, 'max_depth': 13, 'max_features': 13, 'max_leaf_nodes': 6000, 
#'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1400}
#0.8666666666666667

#4th
#{'bootstrap': False, 'max_depth': 12, 'max_features': 13, 'max_leaf_nodes': 10000, 
#'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1400}
#0.8646258503401361

#'bootstrap': False, 'max_depth': 12, 'max_features': 13, 'max_leaf_nodes': 8000, 
#'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1400}
#0.8659863945578231

#
#{'bootstrap': False, 'max_depth': 12, 'max_features': 13, 'max_leaf_nodes': 7000, 
 #'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1400}
#0.8659863945578231
