# 기본데이터 만들기

In [85]:
# ======================================================================================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import numpy as np
import random
from sklearn.model_selection import train_test_split
import scipy as cp
# ======================================================================================================================

scaler = MinMaxScaler()
random.seed(54654)

matplotlib.rcParams['font.family']='Malgun Gothic'   # 한글 사용
matplotlib.rcParams['axes.unicode_minus'] = False

label = pd.read_csv('train_label.csv')



# ======================================================================================================================
print("process - 1")

activity = pd.read_csv('train_activity.csv')

# activity acc_id로 groupby
# - 평균을 내지 않은 이유 : 평균을 냈을 경우 캐릭터는 많지만
#   한 캐릭터만으로 활동한 사람의 정보가 과소평가 될 가능성이 있음
activity = activity.groupby(['acc_id'], as_index = False).sum()
activity.drop(columns = ['day','char_id'], inplace = True)
# print(activity.head())



# ======================================================================================================================


combat = pd.read_csv('train_combat.csv')

combat.drop(columns = ['day', 'server', 'char_id', 'class'], inplace = True)
combat_a = combat.groupby(['acc_id'], as_index = False).sum()
combat_a.drop(columns = 'level', inplace = True)

combat_b = combat.groupby(['acc_id'], as_index = False).max()
#  combat.groupby('acc_id', as_index = False).sum().sort_values('acc_id')

# acc_id 기준으로 정리

combat_b = combat_b[['acc_id', 'level']]
combat = pd.merge(combat_b, combat_a, how = 'inner', on = 'acc_id')



# ======================================================================================================================


payment = pd.read_csv('train_payment.csv')

payment = payment.groupby('acc_id', as_index = False).sum()
payment.drop(columns = 'day', inplace = True)

payment.rename(columns = {'amount_spent' : 'amount_spent_pay'}, inplace = True)


# ======================================================================================================================


trade = pd.read_csv('train_trade.csv')

# 거래에 참여한 횟수를 기준으로

# 판매자로서 활동한 acc_id
trade_a = trade.groupby('source_acc_id', as_index = False).count()
trade_a = trade_a[['source_acc_id', 'day']]

# 구매자로서 활동한 acc_id
trade_b = trade.groupby('target_acc_id', as_index = False).count()
trade_b = trade_b[['target_acc_id', 'day']]

x = trade_a['day'].sum() - trade_b['day'].sum()
# print(x) # 0

trade_a.rename(columns={'source_acc_id':'acc_id',
                        'day':'sell_item_cnt'}, inplace=True)
trade_b.rename(columns={'target_acc_id':'acc_id',
                        'day':'buy_item_cnt'}, inplace=True)

trade = pd.merge(trade_a, trade_b, how = 'outer', on = 'acc_id')

# 실제 데이터 검색
# trade[trade['source_acc_id'] == 6].count()

# 데이터 검증
# trade_a[trade_a['source_acc_id'] == 6]


# ======================================================================================================================


pledge = pd.read_csv('train_pledge.csv')

ple_1 = pledge.groupby(['server', 'pledge_id', 'day'], as_index = False).mean()
ple_1.drop(columns = ['acc_id', 'char_id', 'day'], inplace = True)
ple_1 = ple_1.groupby(['server', 'pledge_id'], as_index = False).sum()

ple_a = pledge.groupby(['acc_id', 'char_id', 'server', 'pledge_id', 'day'], as_index = False).mean()
ple_a = ple_a.groupby(['acc_id', 'char_id', 'server', 'pledge_id'], as_index = False).sum()
ple_a = ple_a[['acc_id', 'char_id', 'server', 'pledge_id']]

pledge = pd.merge(ple_a, ple_1, how = 'outer', on = ['server', 'pledge_id'])
pledge.drop(columns = ['char_id', 'server', 'pledge_id'], inplace = True)
pledge_total = pledge.groupby(['acc_id'], as_index = False).mean()

pledge_total.rename(columns = {'random_attacker_cnt' : 'random_attacker_cnt_plg',
                              'random_defender_cnt' : 'random_defender_cnt_plg',
                              'same_pledge_cnt' : 'same_pledge_cnt_plg',
                              'temp_cnt' : 'temp_cnt_plg',
                              'etc_cnt':'etc_cnt_plg'}, inplace = True)

# ple_1[(ple_1['pledge_id'] == 25467) & (ple_1['server'] == 'aq')]
# ple_a[(ple_a['pledge_id'] == 25467) & (ple_a['server'] == 'aq')]


# ======================================================================================================================
print("process - 2")

# label + activity
label_a = pd.merge(label, activity, how = 'outer', on = 'acc_id')

# (label + activity) + combat
label_b = pd.merge(label_a, combat, how = 'outer', on = 'acc_id')

# (label + activity + combat) + payment
label_c = pd.merge(label_b, payment, how = 'outer', on = 'acc_id')

# (label + activity + combat + payment) + trade
label_d = pd.merge(label_c, trade, how = 'outer', on = 'acc_id')
label_d = label_d[label_d['survival_time'] >= 1]

# (label + activity + combat + payment + trade) + pledge_total
label_z = pd.merge(label_d, pledge_total, how = 'outer', on = 'acc_id')

data = label_z.fillna(0)
print(data.columns)

data[['playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'temp_cnt', 'same_pledge_cnt', 'etc_cnt',
       'num_opponent', 'amount_spent_pay', 'sell_item_cnt', 'buy_item_cnt',
       'play_char_cnt', 'combat_char_cnt', 'pledge_combat_cnt',
       'random_attacker_cnt_plg', 'random_defender_cnt_plg',
       'same_pledge_cnt_plg', 'temp_cnt_plg', 'etc_cnt_plg',
       'combat_play_time', 'non_combat_play_time']] = \
    scaler.fit_transform(data[['playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'temp_cnt', 'same_pledge_cnt', 'etc_cnt',
       'num_opponent', 'amount_spent_pay', 'sell_item_cnt', 'buy_item_cnt',
       'play_char_cnt', 'combat_char_cnt', 'pledge_combat_cnt',
       'random_attacker_cnt_plg', 'random_defender_cnt_plg',
       'same_pledge_cnt_plg', 'temp_cnt_plg', 'etc_cnt_plg',
       'combat_play_time', 'non_combat_play_time']])

data = data.fillna(0)
print(data.columns)
# ======================================================================================================================


process - 1
process - 2
Index(['acc_id', 'survival_time', 'amount_spent', 'playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'temp_cnt', 'same_pledge_cnt', 'etc_cnt',
       'num_opponent', 'amount_spent_pay', 'sell_item_cnt', 'buy_item_cnt',
       'play_char_cnt', 'combat_char_cnt', 'pledge_combat_cnt',
       'random_attacker_cnt_plg', 'random_defender_cnt_plg',
       'same_pledge_cnt_plg', 'temp_cnt_plg', 'etc_cnt_plg',
       'combat_play_time', 'non_combat_play_time'],
      dtype='object')
Index(['acc_id', 'survival_time', 'amount_spent', 'playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', '

## 데이터를 생존과 과금여부로 나눠보기

In [89]:
# 생존여부
data.loc[data['survival_time'] < 64, "survived"] = 0
data.loc[data['survival_time'] == 64, "survived"] = 1

# 과금 여부
data.loc[data['amount_spent'] == 0, "cash"] = 0
data.loc[data['amount_spent'] > 0, "cash"] = 1

# data.drop(['acc_id','survival_time','amount_spent', 'survived'], axis=1, inplace = True)

data.head()

# 나눈 것을 다시 한번 숫자 카운팅 해보기
print(data['survived'][data['survived']==0].count(),'\n',
     data['survived'][data['survived']==1].count())

print(data['cash'][data['cash']==0].count(),'\n',
     data['cash'][data['cash']==1].count())

print(data.columns)

amount_spent = data[['acc_id','amount_spent']]
data1 = data.drop(columns=['amount_spent','cash','survived','survival_time'], axis=1)
data1.shape
# ======================================================================================================================

18004 
 21996
16438 
 23562
Index(['acc_id', 'survival_time', 'amount_spent', 'playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'temp_cnt', 'same_pledge_cnt', 'etc_cnt',
       'num_opponent', 'amount_spent_pay', 'sell_item_cnt', 'buy_item_cnt',
       'play_char_cnt', 'combat_char_cnt', 'pledge_combat_cnt',
       'random_attacker_cnt_plg', 'random_defender_cnt_plg',
       'same_pledge_cnt_plg', 'temp_cnt_plg', 'etc_cnt_plg',
       'combat_play_time', 'non_combat_play_time', 'survived', 'cash'],
      dtype='object')


(40000, 35)

In [None]:
['acc_id','playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'temp_cnt', 'same_pledge_cnt', 'etc_cnt',
       'num_opponent',  'sell_item_cnt','amount_spent_pay', 'buy_item_cnt',
       'play_char_cnt', 'combat_char_cnt', 'pledge_combat_cnt',
       'random_attacker_cnt_plg', 'random_defender_cnt_plg',
       'same_pledge_cnt_plg', 'temp_cnt_plg', 'etc_cnt_plg',
       'combat_play_time', 'non_combat_play_time']

# 생존여부로 결증트리, 랜덤포레스트

In [56]:
# ======================================================================================================================


# data.survived.sum()

#훈련세트, 테스트세트 나누기
from sklearn.model_selection import train_test_split

y = data['survived']


# shap에서 겹치는 공통 변수

# x = data[['combat_play_time',
#  'death',
#  'enchant_count',
#  'etc_cnt',
#  'etc_cnt_plg',
#  'exp_recovery',
#  'level',
#  'non_combat_play_time',
#  'num_opponent',
#  'party_exp',
#  'play_char_cnt',
#  'pledge_combat_cnt',
#  'private_shop',
#  'quest_exp',
#  'random_attacker_cnt',
#  'random_attacker_cnt_plg',
#  'random_defender_cnt',
#  'random_defender_cnt_plg',
#  'same_pledge_cnt_plg',
#  'sell_item_cnt',
#  'temp_cnt',
#  'temp_cnt_plg']]

x = data[['playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'temp_cnt', 'same_pledge_cnt', 'etc_cnt',
       'num_opponent',  'sell_item_cnt','amount_spent_pay', 'buy_item_cnt',
       'play_char_cnt', 'combat_char_cnt', 'pledge_combat_cnt',
       'random_attacker_cnt_plg', 'random_defender_cnt_plg',
       'same_pledge_cnt_plg', 'temp_cnt_plg', 'etc_cnt_plg',
       'combat_play_time', 'non_combat_play_time']]







train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3)

train_x.shape, test_x.shape, train_y.shape, test_y.shape

## DecisionTree

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

decision = DecisionTreeClassifier(max_depth = 5).fit(train_x,train_y)
# print(decision.score(train_x, train_y))
print(decision.score(test_x, test_y))

from sklearn.tree import export_graphviz
import graphviz
import pydot

export_graphviz(decision,
                feature_names = x.columns,
                class_names = ['Death', 'Survived'],
                out_file = 'decisionTree1.dot',
                impurity = True,
                filled = True)

# Encoding 중요
# (graph,) = pydot.graph_from_dot_file('decisionTree1.dot', encoding='utf8')

# Dot 파일을 Png 이미지로 저장
# graph.write_png('decisionTree1.png')

with open('decisionTree1.dot') as f:
    dot_graph = str(open("decisionTree1.dot", "rb").read(), 'utf8')

src = graphviz.Source(dot_graph)
src.render('payment-lable.gv', view=True)


# ======================================================================================================================
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(train_x,train_y)
# print(clf.score(train_x, train_y))
print(rfc.score(test_x, test_y))
rfc_y_score = rfc.predict_proba(test_x)

0.6936666666666667
0.7395


# 과금여부로 결정트리, 랜덤포레스트

In [78]:
# ======================================================================================================================


# data.survived.sum()

#훈련세트, 테스트세트 나누기
from sklearn.model_selection import train_test_split

y = data['cash']

# shap에서 겹치는 공통 변수
# x = data[['combat_play_time',
#  'death',
#  'enchant_count',
#  'etc_cnt',
#  'etc_cnt_plg',
#  'exp_recovery',
#  'level',
#  'non_combat_play_time',
#  'num_opponent',
#  'party_exp',
#  'play_char_cnt',
#  'pledge_combat_cnt',
#  'private_shop',
#  'quest_exp',
#  'random_attacker_cnt',
#  'random_attacker_cnt_plg',
#  'random_defender_cnt',
#  'random_defender_cnt_plg',
#  'same_pledge_cnt_plg',
#  'sell_item_cnt',
#  'temp_cnt',
#  'temp_cnt_plg']]

x = data[['acc_id','playtime', 'npc_kill',
       'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive',
       'exp_recovery', 'fishing', 'private_shop', 'game_money_change',
       'enchant_count', 'level', 'pledge_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'temp_cnt', 'same_pledge_cnt', 'etc_cnt',
       'num_opponent',  'sell_item_cnt','amount_spent_pay', 'buy_item_cnt',
       'play_char_cnt', 'combat_char_cnt', 'pledge_combat_cnt',
       'random_attacker_cnt_plg', 'random_defender_cnt_plg',
       'same_pledge_cnt_plg', 'temp_cnt_plg', 'etc_cnt_plg',
       'combat_play_time', 'non_combat_play_time']]







train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3)

train_x.shape, test_x.shape, train_y.shape, test_y.shape

## DecisionTree

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

decision = DecisionTreeClassifier(max_depth = 6).fit(train_x,train_y)
# print(decision.score(train_x, train_y))
print(decision.score(test_x, test_y))

from sklearn.tree import export_graphviz
import graphviz
import pydot

export_graphviz(decision,
                feature_names = x.columns,
                class_names = ['No_cash', 'Yes_cash'],
                out_file = 'decisionTree1.dot',
                impurity = True,
                filled = True)

# Encoding 중요
# (graph,) = pydot.graph_from_dot_file('decisionTree1.dot', encoding='utf8')

# Dot 파일을 Png 이미지로 저장
# graph.write_png('decisionTree1.png')

with open('decisionTree1.dot') as f:
    dot_graph = str(open("decisionTree1.dot", "rb").read(), 'utf8')

src = graphviz.Source(dot_graph)
src.render('payment-lable.gv', view=True)


# ======================================================================================================================
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(train_x,train_y)
# print(clf.score(train_x, train_y))
print(rfc.score(test_x, test_y))
rfc_y_score = rfc.predict_proba(test_x)

0.8431666666666666
0.8585833333333334


### 랜덤포레스트 결과에서 예측값 뽑기

In [79]:
rfc_y_score = rfc.predict(data1)
rfc_y = pd.DataFrame(rfc_y_score, columns = ['cash_predict'])
# print(type(rfc_y))


data_cash = pd.concat([data1, rfc_y], axis=1)
data_cash.head()
rfc_y.shape
data_cash.columns

ValueError: Number of features of the model must match the input. Model n_features is 35 and input n_features is 34 

In [76]:

data_cash.shape
print(data_cash[data_cash['cash_predict']==1].shape, data[data['amount_spent']>0].shape)
print(data_cash[data_cash['cash_predict']==0].shape, data[data['amount_spent']==0].shape)


(24960, 36) (23562, 39)
(15040, 36) (16438, 39)


In [88]:
data_cash_yes = data_cash[data_cash['cash_predict']==1]

data_cash_yes = pd.merge([data_cash_yes, amount_spent], on='acc_id', how='inner')

data_cash_yes_data = [ data_cash_yes.iloc[i,:].tolist() for i in range(len(data_cash_yes.index.values))]
X = data_cash_yes_data

y_data = [[i] for i in data[data['amount_spent']>0].tolist() ]

y= y_data

TypeError: merge() missing 1 required positional argument: 'right'

In [None]:
from sklearn.model_selection import train_test_split

x_data, X_test, y_data, y_test = train_test_split(X, y, test_size = 1/3, random_state = 70) #42