In [82]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV



from sklearn.tree import DecisionTreeClassifier
from sklearn import tree



In [40]:
data = pd.read_csv('features.csv')
# Delete unesessary features from dataset
data = data.dropna()
data = data.drop(['enterworld_num','buyitemnowmainauction_num','completechallengeweek_num'],axis=1)
data = data.drop(['actor_account_id'
                    ,'survival_time'
                    ],axis=1)
data = data.drop(columns=data.columns[0],axis=1)
# Setup & split dataset for training
X = data.copy()
y = X.pop('churn_yn')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100)
# Print training data shape
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
# Get feature names from training set
feature_names = [f"feature {i}" for i in range(X.shape[1])]

(995, 33) (995,)
(2985, 33) (2985,)


In [41]:
clf = DecisionTreeClassifier(criterion='gini', 
    splitter='best', 
    max_depth=4, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    min_weight_fraction_leaf=0.0, 
    max_features=20, 
    random_state=None, 
    max_leaf_nodes=None, 
    min_impurity_decrease=0.0, 
    class_weight=None, 
    ccp_alpha=0.0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


In [42]:
#F1-score
print("F1 Score")
print('f1 ' , f1_score(y_test, predictions))
#Cross validation
scores = cross_val_score(clf, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
f1  0.5938566552901025
CrossValidation Scores: 
[0.77386935 0.75628141 0.75376884 0.74623116 0.75125628]
0.76 accuracy with a standard deviation of 0.01


In [43]:
d_tree = clf
d_importances = d_tree.feature_importances_
d_tree_importances = pd.Series(d_importances, index=feature_names)
d_tree_feat_importances = pd.DataFrame(d_tree.feature_importances_, index=d_tree.feature_names_in_, columns=["Importance"])
d_tree_feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
d_tree_feat_importances.iloc[:10]

Unnamed: 0,Importance
joinparty_num,0.557869
masteryexp,0.245736
sessions_num,0.089058
spendmoney_num,0.038607
gathering_num,0.016008
longest_time_between_events,0.015353
average_money_spent_per_session,0.011615
itemupgrade_successrate,0.009004
event_num,0.008875
reason_spendmoney,0.007877


In [44]:
num = 5
d_tree_feat_importances['index1'] = d_tree_feat_importances.index
top = d_tree_feat_importances['index1'].iloc[:num].values
a="churn_yn"
topflop = np.append(top,a)
print(topflop)


['joinparty_num' 'masteryexp' 'sessions_num' 'spendmoney_num'
 'gathering_num' 'churn_yn']


In [45]:
data = pd.read_csv('features.csv',usecols= topflop)
data = data.dropna()
print(data)
# Setup & split dataset for training
X = data.copy()
y = X.pop('churn_yn')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100)
# Print training data shape
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
# Get feature names from training set
feature_names = [f"feature {i}" for i in range(X.shape[1])]

      churn_yn  joinparty_num  spendmoney_num  sessions_num   masteryexp  \
1          0.0          176.0          1358.0         167.0   27624894.0   
2          0.0          148.0           907.0         152.0   30318969.0   
3          0.0          104.0           540.0          87.0   16523152.0   
4          0.0          273.0           470.0         187.0   47490847.0   
5          1.0           12.0           190.0          28.0   12831466.0   
...        ...            ...             ...           ...          ...   
4015       1.0           54.0          2114.0         176.0   25902994.0   
4016       1.0            0.0            44.0           7.0    5193066.0   
4017       0.0            4.0           934.0         123.0    6787450.0   
4018       0.0          410.0          3120.0         771.0  191019462.0   
4019       0.0          255.0          2481.0         271.0  197575892.0   

      gathering_num  
1              97.0  
2               6.0  
3               0.0  

In [46]:
param_grid = { 
    'max_features': ['sqrt','log2'],
    'max_depth' : range(2,10),
    'criterion' :['gini','entropy'],
    'min_samples_split' : range (2,10),
    'min_samples_leaf' : range (2,5)
}

In [47]:
tree = DecisionTreeClassifier()
CV_rfc = GridSearchCV(estimator=tree,param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)



In [79]:
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)
param_grid=CV_rfc.best_params_

{'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}
DecisionTreeClassifier(max_depth=4, max_features='sqrt', min_samples_leaf=4)


In [80]:
clf = DecisionTreeClassifier(criterion='gini',
    max_depth=4,
    splitter='best',
    min_samples_split=2, 
    min_samples_leaf=4,
    max_features='sqrt'
    )
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


In [81]:
#F1-score
print("F1 Score")
print('f1 ' , f1_score(y_test, predictions))
#Cross validation
scores = cross_val_score(clf, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
f1  0.6104553119730185
CrossValidation Scores: 
[0.77763819 0.73492462 0.77889447 0.74497487 0.76005025]
0.76 accuracy with a standard deviation of 0.02


In [83]:
filename = 'tree_model.sav'
pickle.dump(clf, open(filename, 'wb'))