## Import

In [23]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV


from sklearn import tree
from sklearn.ensemble import RandomForestClassifier



## Load data

In [24]:
data = pd.read_csv('features.csv')
# Delete unesessary features from dataset
data = data.dropna()
#data = data.drop(['enterworld_num','buyitemnowmainauction_num','completechallengeweek_num'],axis=1)
data = data.drop(['actor_account_id'
                    ,'survival_time'
                    ],axis=1)
data = data.drop(columns=data.columns[0],axis=1)
# Setup & split dataset for training
X = data.copy()
y = X.pop('churn_yn')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100)
# Print training data shape
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
# Get feature names from training set
feature_names = [f"feature {i}" for i in range(X.shape[1])]

(995, 36) (995,)
(2985, 36) (2985,)


In [25]:
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)
predictions = forest.predict(X_test)

In [26]:
#F1-score
print("F1 Score")
mean_score = cross_val_score(forest, X, y, scoring="f1", cv = 10).mean()
std_score = cross_val_score(forest, X, y, scoring="f1", cv = 10).std()
print('Mean F1:', mean_score)
print('Std F1 score', std_score)
#Cross validation
scores = cross_val_score(forest, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
Mean F1: 0.545307406752021
Std F1 score 0.03389820745289171
CrossValidation Scores: 
[0.78015075 0.74371859 0.79271357 0.75125628 0.7701005 ]
0.77 accuracy with a standard deviation of 0.02


In [27]:
result = permutation_importance(
    forest, X_test, y_test, n_repeats=10, random_state=100, n_jobs=2)

In [29]:
num = 5
feat_importances_p = pd.DataFrame(result.importances_mean, index=forest.feature_names_in_, columns=["Importance"])
feat_importances_p['index1'] = feat_importances_p.index
top = feat_importances_p['index1'].iloc[:num].values
#flop = feat_importances_p['index1'].iloc[-num:].values
a="churn_yn"
topflop = np.append(top,a)
#topflop = np.append(topflop,a)
print(topflop)


['event_num' 'enterworld_num' 'levelup_num' 'joinparty_num'
 'spendmoney_num' 'churn_yn']


In [30]:
data = pd.read_csv('features.csv',usecols= topflop)
data = data.dropna()
print(data)
# Setup & split dataset for training
X = data.copy()
y = X.pop('churn_yn')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100)
# Print training data shape
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
# Get feature names from training set
feature_names = [f"feature {i}" for i in range(X.shape[1])]

      churn_yn  event_num  enterworld_num  levelup_num  joinparty_num  \
1          0.0    39385.0           193.0          5.0          176.0   
2          0.0    37423.0           176.0         19.0          148.0   
3          0.0    33900.0           102.0         74.0          104.0   
4          0.0    49063.0           237.0          0.0          273.0   
5          1.0     4608.0            28.0          0.0           12.0   
...        ...        ...             ...          ...            ...   
4015       1.0    37206.0           176.0         39.0           54.0   
4016       1.0     4932.0             6.0          5.0            0.0   
4017       0.0    28439.0           122.0        106.0            4.0   
4018       0.0   103938.0           894.0          6.0          410.0   
4019       0.0    63198.0           322.0          0.0          255.0   

      spendmoney_num  
1             1358.0  
2              907.0  
3              540.0  
4              470.0  
5       

In [33]:
param_grid = { 
    'n_estimators': range(900, 1200,100),
    'max_features': ['sqrt','log2'],
    'max_depth' : range(5,20,5),
    'criterion' :['gini','entropy']
}

In [34]:
forest = RandomForestClassifier(random_state=0)
CV_rfc = GridSearchCV(estimator=forest,param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)



In [13]:
print(CV_rfc.best_params_)
param_grid=CV_rfc.best_params_

{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 1000}


In [31]:
forest = RandomForestClassifier(random_state=0,criterion='entropy', max_depth=6,max_features='sqrt',n_estimators=1000)
forest.fit(X_train, y_train)
predictions = forest.predict(X_test)

In [32]:
#F1-score
print("F1 Score")
mean_score = cross_val_score(forest, X, y, scoring="f1", cv = 5).mean()
std_score = cross_val_score(forest, X, y, scoring="f1", cv = 5).std()
print('Mean F1:', mean_score)
print('Std F1 score', std_score)
#Cross validation
scores = cross_val_score(forest, X, y, cv=5)
print("CrossValidation Scores: ")
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

F1 Score
Mean F1: 0.5395930088716849
Std F1 score 0.03398441786418995
CrossValidation Scores: 
[0.79271357 0.76256281 0.77512563 0.75125628 0.75753769]
0.77 accuracy with a standard deviation of 0.01


In [16]:
filename = 'forest_model.sav'
pickle.dump(forest, open(filename, 'wb'))