In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV



from sklearn.tree import DecisionTreeClassifier
from sklearn import tree



In [4]:
data = pd.read_csv('features.csv')
# Delete unesessary features from dataset
data = data.dropna()
data = data.drop(['enterworld_num','buyitemnowmainauction_num','completechallengeweek_num'],axis=1)
data = data.drop(['actor_account_id'
                    ,'survival_time'
                    ],axis=1)
data = data.drop(columns=data.columns[0],axis=1)
# Setup & split dataset for training
X = data.copy()
y = X.pop('churn_yn')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100)
# Print training data shape
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
# Get feature names from training set
feature_names = [f"feature {i}" for i in range(X.shape[1])]

(995, 33) (995,)
(2985, 33) (2985,)


In [5]:
clf = DecisionTreeClassifier(criterion='gini', 
    splitter='best', 
    max_depth=4, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    min_weight_fraction_leaf=0.0, 
    max_features=20, 
    random_state=None, 
    max_leaf_nodes=None, 
    min_impurity_decrease=0.0, 
    class_weight=None, 
    ccp_alpha=0.0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


In [6]:
#F1-score
print("F1 Score")
print('f1 ' , f1_score(y_test, predictions))
#Cross validation
scores = cross_val_score(clf, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
f1  0.5820642978003384
CrossValidation Scores: 
[0.79396985 0.74246231 0.78015075 0.76758794 0.74371859]
0.77 accuracy with a standard deviation of 0.02


In [7]:
d_tree = clf
d_importances = d_tree.feature_importances_
d_tree_importances = pd.Series(d_importances, index=feature_names)
d_tree_feat_importances = pd.DataFrame(d_tree.feature_importances_, index=d_tree.feature_names_in_, columns=["Importance"])
d_tree_feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
#d_tree_feat_importances.iloc[:10]

In [8]:
num = 30
d_tree_feat_importances['index1'] = d_tree_feat_importances.index
top = d_tree_feat_importances['index1'].iloc[:num].values
a="churn_yn"
topflop = np.append(top,a)
print(topflop)


['sessions_num' 'masteryexp' 'longest_time_between_events' 'event_num'
 'spendmoney_num' 'average_time_between_events'
 'average_money_spent_per_session' 'reason_spendmoney'
 'average_time_between_logins' 'joinparty_num' 'trade_num'
 'itemupgrade_successrate' 'partybattlepoints_max' 'reason_getmoney'
 'partybattles_per_session' 'duels_per_session' 'duel_num' 'has_smurf_yn'
 'gathering_num' 'money_max' 'duel_rating_score_max' 'duelpoints_max'
 'duel_kd' 'partybattle_num' 'targetaccountid_num' 'levelup_num'
 'faction1' 'completechallengetoday_num' 'class' 'level_max' 'churn_yn']


In [39]:
data = pd.read_csv('features.csv',usecols= topflop)
data = data.dropna()
print(data)
# Setup & split dataset for training
X = data.copy()
y = X.pop('churn_yn')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100)
# Print training data shape
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
# Get feature names from training set
feature_names = [f"feature {i}" for i in range(X.shape[1])]

      churn_yn  event_num  joinparty_num  level_min  sessions_num   masteryexp
1          0.0    39385.0          176.0       15.0         167.0   27624894.0
2          0.0    37423.0          148.0        1.0         152.0   30318969.0
3          0.0    33900.0          104.0        1.0          87.0   16523152.0
4          0.0    49063.0          273.0       50.0         187.0   47490847.0
5          1.0     4608.0           12.0       39.0          28.0   12831466.0
...        ...        ...            ...        ...           ...          ...
4015       1.0    37206.0           54.0       23.0         176.0   25902994.0
4016       1.0     4932.0            0.0       50.0           7.0    5193066.0
4017       0.0    28439.0            4.0        1.0         123.0    6787450.0
4018       0.0   103938.0          410.0       50.0         771.0  191019462.0
4019       0.0    63198.0          255.0       50.0         271.0  197575892.0

[3980 rows x 6 columns]
(995, 5) (995,)
(2985, 5) (

In [40]:
param_grid = { 
    'max_features': ['sqrt','log2'],
    'max_depth' : range(2,10),
    'criterion' :['gini','entropy'],
    'min_samples_split' : range (2,10),
    'min_samples_leaf' : range (2,5)
}

In [41]:
tree = DecisionTreeClassifier()
CV_rfc = GridSearchCV(estimator=tree,param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)



In [42]:
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)
param_grid=CV_rfc.best_params_

{'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5}
DecisionTreeClassifier(max_depth=3, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=5)


In [43]:
clf = DecisionTreeClassifier(criterion='gini',
    max_depth=4,
    splitter='best',
    min_samples_split=2, 
    min_samples_leaf=4,
    max_features='sqrt'
    )
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


In [44]:
#F1-score
print("F1 Score")
print('f1 ' , f1_score(y_test, predictions))
#Cross validation
scores = cross_val_score(clf, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
f1  0.5915032679738563
CrossValidation Scores: 
[0.77386935 0.75       0.77889447 0.76256281 0.75628141]
0.76 accuracy with a standard deviation of 0.01


In [52]:
filename = 'tree_model.sav'
pickle.dump(clf, open(filename, 'wb'))