In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV
from numpy import asarray
from sklearn.preprocessing import scale


from sklearn.tree import DecisionTreeClassifier
from sklearn import tree



In [3]:
data = pd.read_csv('features.csv')
# Delete unnecessary features from the dataset: rows with missing values first,
# then columns that are not used as model inputs.
data = data.dropna()
data = data.drop(['enterworld_num','buyitemnowmainauction_num','completechallengeweek_num'],axis=1)
data = data.drop(['actor_account_id', 'survival_time'], axis=1)
# Drop whatever column is first by position
# (presumably an index column written out with the CSV -- TODO confirm).
data = data.drop(columns=data.columns[0])

# Setup & split dataset for training: 'churn_yn' is the target, the rest are features.
X = data.copy()
y = X.pop('churn_yn')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100)

# Normalize data.
# The statistics are taken from the training split only and the same transform
# is then applied to the test split, so a model fitted on X_train sees X_test on
# the same scale. Previously only X_train was scaled.
cols=['spendmoney_num', 'money_max', 'average_money_spent_per_session', 'average_time_between_logins'] #Cols we want to normalize
# Work on explicit copies so the column assignment below does not write into a
# slice of X.
X_train = X_train.copy()
X_test = X_test.copy()
train_mean = X_train[cols].mean()
# Population std (ddof=0); a constant column gets a divisor of 1 instead of 0.
train_std = X_train[cols].std(ddof=0).replace(0, 1)
X_train[cols] = (X_train[cols] - train_mean) / train_std
X_test[cols] = (X_test[cols] - train_mean) / train_std

# Print test and training data shapes
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
# Positional placeholder labels ("feature 0", "feature 1", ...), one per feature column.
feature_names = [f"feature {i}" for i in range(X.shape[1])]

data


(995, 33) (995,)
(2985, 33) (2985,)


Unnamed: 0,churn_yn,event_num,levelup_num,joinparty_num,spendmoney_num,duel_num,duel_kd,partybattle_num,completechallengetoday_num,itemupgrade_successrate,...,money_max,gathering_num,has_smurf_yn,average_money_spent_per_session,duels_per_session,partybattles_per_session,average_time_between_events,average_time_between_logins,reason_getmoney,reason_spendmoney
1,0.0,39385.0,5.0,176.0,1358.0,34.0,0.79,37.0,12.0,0.00,...,3.742728e+07,97.0,1.0,804582.92,0.18,0.19,83.21,16967.16,101.0,510.0
2,0.0,37423.0,19.0,148.0,907.0,10.0,2.33,0.0,12.0,0.00,...,1.733137e+07,6.0,1.0,396840.67,0.06,0.00,84.66,17974.76,101.0,510.0
3,0.0,33900.0,74.0,104.0,540.0,5.0,0.67,4.0,2.0,0.00,...,2.643581e+07,0.0,1.0,777688.27,0.05,0.04,93.17,30701.98,101.0,506.0
4,0.0,49063.0,0.0,273.0,470.0,0.0,0.00,0.0,36.0,0.00,...,7.976895e+07,0.0,0.0,68655.01,0.00,0.00,69.00,14263.94,101.0,560.0
5,1.0,4608.0,0.0,12.0,190.0,0.0,0.00,0.0,2.0,0.00,...,7.828748e+06,7.0,1.0,56009.32,0.00,0.00,750.82,123422.01,101.0,567.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015,1.0,37206.0,39.0,54.0,2114.0,0.0,0.00,0.0,0.0,0.00,...,1.501839e+09,72.0,1.0,10688811.12,0.00,0.00,82.63,17466.62,101.0,522.0
4016,1.0,4932.0,5.0,0.0,44.0,0.0,0.00,0.0,0.0,0.00,...,4.740085e+06,0.0,1.0,10412.50,0.00,0.00,171.00,140017.91,101.0,506.0
4017,0.0,28439.0,106.0,4.0,934.0,0.0,0.00,0.0,0.0,0.00,...,1.507858e+06,0.0,1.0,85665.43,0.00,0.00,115.68,26950.02,101.0,522.0
4018,0.0,103938.0,6.0,410.0,3120.0,0.0,0.00,0.0,65.0,0.00,...,6.862637e+07,35.0,1.0,247444.82,0.00,0.00,33.30,3870.79,101.0,571.0


In [41]:
# Baseline decision tree: shallow (max_depth=4), Gini impurity, considering 20
# features per split. All other arguments are spelled out at the values the
# original cell used.
clf = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=20,
    # Fixed seed: with max_features set, the features tried at each split are
    # drawn at random, so an unseeded tree changes from run to run.
    random_state=100,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight=None,
    ccp_alpha=0.0,
)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


In [42]:
#F1-score
print("F1 Score")
# Run the 7-fold F1 cross-validation once and derive both statistics from the
# same fold scores; two separate runs cost twice as much and can disagree when
# the estimator is not seeded.
f1_scores = cross_val_score(clf, X, y, scoring="f1", cv = 7)
mean_score = f1_scores.mean()
std_score = f1_scores.std()
print('Mean F1:', mean_score)
print('Std F1 score', std_score)
#Cross validation (default scorer of the estimator, 5 folds)
scores = cross_val_score(clf, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
Mean F1: 0.54165642316591
Std F1 score 0.059381340000571575
CrossValidation Scores: 
[0.77386935 0.74371859 0.76130653 0.73366834 0.75251256]
0.75 accuracy with a standard deviation of 0.01


In [109]:
# Inspect which features the currently fitted classifier relies on.
d_tree = clf
d_importances = d_tree.feature_importances_
# Same importances, labelled with the positional placeholder names.
d_tree_importances = pd.Series(d_importances, index=feature_names)

# One-row table ("Importance") whose columns are the real feature names,
# ordered from most to least important.
d_tree_feat_importances = (
    pd.DataFrame(
        d_tree.feature_importances_,
        index=d_tree.feature_names_in_,
        columns=["Importance"],
    )
    .sort_values(by="Importance", ascending=False)
    .transpose()
)
# Default to_dict() orientation: {feature name: {"Importance": value}}.
weights = d_tree_feat_importances.to_dict()
d_tree_feat_importances

Unnamed: 0,masteryexp,sessions_num,spendmoney_num,targetaccountid_num,trade_num,longest_time_between_events,average_time_between_events,partybattles_per_session,duels_per_session,average_money_spent_per_session,...,level_max,level_min,guildlevelup_num,itemupgrade_successrate,completechallengetoday_num,partybattle_num,duel_kd,duel_num,joinparty_num,reason_spendmoney
Importance,0.671505,0.15221,0.129744,0.027524,0.01095,0.008066,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
# Hyper-parameter search space for the decision tree grid search.
# Every combination of the values below is a candidate.
param_grid = dict(
    max_features=['sqrt', 'log2'],
    max_depth=range(4, 25),
    criterion=['gini', 'entropy'],
    min_samples_split=range(2, 10),
    min_samples_leaf=range(2, 5),
    min_weight_fraction_leaf=(0.0, 0.1, 0.2, 0.3),
    # Only the weight of class label 0 is varied.
    class_weight=[{0: 1}, {0: 2}, {0: 4}, {0: 6}, {0: 10}],
)

In [117]:
# Exhaustive 5-fold grid search over param_grid on the training split.
# The estimator is bound to `base_tree` rather than `tree`, so the `tree` module
# imported from sklearn at the top of the notebook is not overwritten.
# The seed makes the search repeatable: the grid uses max_features='sqrt'/'log2',
# which samples candidate features at random for every split.
base_tree = DecisionTreeClassifier(random_state=100)
CV_rfc = GridSearchCV(estimator=base_tree, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)



In [118]:
# Report the winning hyper-parameter combination, then the estimator refitted with it.
for search_result in (CV_rfc.best_params_, CV_rfc.best_estimator_):
    print(search_result)

{'class_weight': {0: 1}, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 9, 'min_weight_fraction_leaf': 0.0}
DecisionTreeClassifier(class_weight={0: 1}, criterion='entropy', max_depth=6,
                       max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=9)


In [77]:
# Decision tree configured from the grid-search result.
# NOTE(review): min_weight_fraction_leaf is 0.1 here while the recorded search
# output reports 0.0 as best -- confirm this difference is intentional.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    splitter='best',
    min_samples_split=9,
    min_samples_leaf=4,
    max_features='sqrt',
    min_weight_fraction_leaf=0.1,
    # class_weight is keyed by class label, not by feature. The target is scored
    # with binary F1 elsewhere in this notebook, so only labels 0 and 1 are
    # listed; the former entries for labels 2..19 matched no class and are
    # either ignored or rejected depending on the scikit-learn version.
    class_weight={0: 1, 1: 1},
    # Fixed seed: max_features='sqrt' samples candidate features at random.
    random_state=100,
)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


In [78]:
#F1-score
print("F1 Score")
# Run the 10-fold F1 cross-validation once and derive both statistics from the
# same fold scores; two separate runs cost twice as much and can disagree when
# the estimator is not seeded.
f1_scores = cross_val_score(clf, X, y, scoring="f1", cv = 10)
mean_score = f1_scores.mean()
std_score = f1_scores.std()
print('Mean F1:', mean_score)
print('Std F1 score', std_score)
#Cross validation (default scorer of the estimator, 5 folds)
scores = cross_val_score(clf, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
Mean F1: 0.5269539195330346
Std F1 score 0.09277170365601303
CrossValidation Scores: 
[0.73115578 0.75628141 0.74371859 0.74874372 0.74120603]
0.74 accuracy with a standard deviation of 0.01


In [48]:
# Persist the fitted classifier to disk.
filename = 'tree_model.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the dump call left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)