# 변수 설명

game_id : 경기 구분 기호

winner : 승리 선수 --> 한 게임에서 0과 1이라는 플레이어가 있는데 둘 중 누가 이긴지를 의미함

time : 경기 시간

player : 선수

##1) 0: 첫 번째 선수

##2) 1: 두 번째 선수

species : 종족

##1) T: 테란

##2) P: 프로토스

##3) Z: 저그

event : 행동 종류

event_contents : 행동 상세

##1) Ability : 생산, 공격 등 선수의 주요 행동

##2) AddToControlGroup : 부대에 추가

##3) Camera : 시점 선택

##4) ControlGroup : 부대 행동

##5) GetControlGroup : 부대 불러오기

##6) Right Click : 마우스 우클릭

##7) Selection : 객체 선택

##8) SetControlGroup : 부대 지정

In [4]:
# 패키지 불러오기

import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
from tqdm import tqdm                       # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import warnings                             
warnings.filterwarnings("ignore")           # 경고 문구 미표시

# 패키지 불러오기

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn')
sns.set(font_scale=2.5)

import missingno as msno # 결측값

import warnings # 경고무시
warnings.filterwarnings('ignore')

%matplotlib inline

In [5]:
pd.options.display.float_format = '{:.2f}'.format # 소수점 둘째자리까지 표현


In [6]:
train = pd.read_pickle('train_17m.pickle')

In [7]:
train

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.00,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.00,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"
...,...,...,...,...,...,...,...
17282259,10000,1,9.58,0,T,Right Click,"Target: None [026C0001]; Location: (67.0, 14.5..."
17282260,10000,1,9.59,1,Z,GetControlGroup,
17282261,10000,1,9.59,0,T,Ability,(1360) - TrainSCV
17282262,10000,1,9.59,1,Z,GetControlGroup,


In [8]:
def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError

def data_preparation(df, answer=False):
    game_ids = df['game_id'].unique()
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']
    unique_event_0, unique_event_1, delta_event = {}, {}, {}
    for event in events:
        unique_event_0['P0_' + event] = 0
        unique_event_1['P1_' + event] = 0
        delta_event['delta_' + event] = 0
        
    species = df.groupby(['game_id', 'player']).species.unique()
    event_count = df.groupby(['game_id', 'player']).event.value_counts()
    if answer:
        winners = df.groupby(['game_id']).winner.max()
    
    x_data, y_data = [], []
    for game_id in tqdm(game_ids):
        df_event_count = event_count[game_id].unstack(level=-1)
        df = pd.DataFrame(species[game_id])
        df = pd.concat([df, df_event_count], axis=1)   
        df = df.fillna(0)
        
        df_P0_species = pd.DataFrame([species_converter(df.loc[0]['species'][0])], columns=['P0_species'])        
        df_P1_species = pd.DataFrame([species_converter(df.loc[1]['species'][0])], columns=['P1_species'])
        df = df.drop(['species'], axis=1)

        df_P0_event = unique_event_0.copy()
        for column in df.columns:
            df_P0_event['P0_' + column] = df.loc[0][column]
        df_P0_event = pd.DataFrame(pd.Series(df_P0_event)).T

        df_P1_event = unique_event_1.copy()
        for column in df.columns:
            df_P1_event['P1_' + column] = df.loc[1][column]
        df_P1_event = pd.DataFrame(pd.Series(df_P1_event)).T
        
        df_delta_event = delta_event.copy()
        for column in df.columns:
            df_delta_event['delta_' + column] = df_P0_event['P0_' + column][0] - df_P1_event['P1_' + column][0]
        df_delta_event = pd.DataFrame(pd.Series(df_delta_event)).T

        out = pd.concat([df_P0_species, df_P0_event, df_P1_species, df_P1_event, df_delta_event], axis=1)
        out.index = [game_id]
        out.index.name = 'game_id'
        
        x_data.append(out)
        if answer:
            y_data.append(winners[game_id])  

    x_data = pd.concat(x_data)
    y_data = np.array(y_data)
    
    return x_data, y_data

In [9]:
x_train, y_train = data_preparation(train, answer=True)
x_train.head()

100%|████████████████████████████████████████████████████████████████████████████| 10001/10001 [02:11<00:00, 75.89it/s]


Unnamed: 0_level_0,P0_species,P0_Ability,P0_AddToControlGroup,P0_Camera,P0_ControlGroup,P0_GetControlGroup,P0_Right Click,P0_Selection,P0_SetControlGroup,P1_species,...,P1_Selection,P1_SetControlGroup,delta_Ability,delta_AddToControlGroup,delta_Camera,delta_ControlGroup,delta_GetControlGroup,delta_Right Click,delta_Selection,delta_SetControlGroup
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,34.0,2.0,444.0,0.0,24.0,35.0,50.0,3.0,0,...,57.0,1.0,0.0,2.0,19.0,0.0,21.0,7.0,-7.0,2.0
1,1,77.0,1.0,627.0,0.0,162.0,160.0,186.0,10.0,0,...,116.0,8.0,10.0,1.0,-231.0,0.0,131.0,29.0,70.0,2.0
2,1,69.0,6.0,413.0,0.0,99.0,160.0,90.0,14.0,2,...,232.0,9.0,-16.0,1.0,-312.0,-2.0,-10.0,-44.0,-142.0,5.0
3,0,82.0,0.0,713.0,0.0,132.0,276.0,180.0,6.0,1,...,148.0,19.0,-7.0,0.0,325.0,0.0,-578.0,8.0,32.0,-13.0
4,0,57.0,1.0,430.0,0.0,224.0,177.0,67.0,10.0,2,...,126.0,8.0,21.0,-3.0,158.0,0.0,125.0,71.0,-59.0,2.0


In [10]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [11]:
# 모델과 관련없는 변수 고정
func_fixed = partial(lgb_cv, x_data=x_train, y_data=y_train, n_splits=5, output='score') 
# 베이지안 최적화 범위 설정
lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (16, 1024),        # num_leaves,       범위(16~1024)
        'learning_rate': (0.0001, 0.1),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (16, 1024),      # n_estimators,     범위(16~1024)
        'subsample': (0, 1),             # subsample,        범위(0~1)
        'colsample_bytree': (0, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (0, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (0, 50),           # reg_lambda,       범위(0~50)
    }, 
    random_state=4321                    # 시드 고정
)
lgbBO.maximize(init_points=5, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

# 이 예제에서는 7개 하이퍼 파라미터에 대해 30회 조정을 시도했습니다.
# 다양한 하이퍼 파라미터, 더 많은 iteration을 시도하여 최상의 모델을 얻어보세요!
# LightGBM Classifier: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5956  [0m | [0m 0.0708  [0m | [0m 0.08152 [0m | [0m 790.0   [0m | [0m 304.6   [0m | [0m 1.931   [0m | [0m 48.95   [0m | [0m 0.4062  [0m |
| [95m 2       [0m | [95m 0.6206  [0m | [95m 0.7578  [0m | [95m 0.009006[0m | [95m 328.4   [0m | [95m 639.9   [0m | [95m 4.599   [0m | [95m 10.92   [0m | [95m 0.6635  [0m |
| [0m 3       [0m | [0m 0.6035  [0m | [0m 0.6787  [0m | [0m 0.09504 [0m | [0m 299.5   [0m | [0m 640.8   [0m | [0m 3.833   [0m | [0m 20.02   [0m | [0m 0.9427  [0m |
| [0m 4       [0m | [0m 0.5999  [0m | [0m 0.9299  [0m | [0m 0.09484 [0m | [0m 394.5   [0m | [0m 361.0   [0m | [0m 6.648   [0m | [0m 2.116   [0m | [0m 0.2322  [0m |
| [0m 5       [0m | [0m 0.6172  [0m | 

In [12]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'], 
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'], 
    x_data=x_train, y_data=y_train, n_splits=5, output='model')

In [13]:
test = pd.read_csv('data/test.csv')
x_test, _ = data_preparation(test, answer=False)

100%|████████████████████████████████████████████████████████████████████████████| 16787/16787 [03:49<00:00, 73.16it/s]


In [14]:
preds = []
for model in models:
    pred = model.predict_proba(x_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis=0)

In [15]:
submission = pd.read_csv('data/sample_submission.csv', index_col=0)
submission['winner'] = submission['winner'] + pred
submission.to_csv('submission.csv')
submission.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.56
38873,0.51
38874,0.47
38875,0.36
38876,0.45


-------------------------------------------------------------------------------

## APM
1분에 몇번을 움직였는지?

In [None]:
df['game_id'].max()

In [None]:
x_train['P0_APM'] = 0
x_train['P1_APM'] = 0
x_train

In [None]:
x_test['P0_APM'] = 0
x_test['P1_APM'] = 0
x_test

In [None]:
x_train.columns

In [None]:
x_train.index.max()

In [None]:
x_train['P0_APM'] = x_train['P0_APM'].index

In [None]:
x_train['P1_APM'] = x_train['P1_APM'].index

In [None]:
def p0_apm(i):  
    return x_train.iloc[i, [1, 2, 4, 5, 6, 7, 8]].sum() / train.loc[train['game_id']==i, 'time'].max()


def p1_apm(i):
    return x_train.iloc[i, [10, 11, 13, 14, 15, 16, 17]].sum() / train.loc[train['game_id']==i, 'time'].max()

In [None]:
x_train['P0_APM'] = x_train['P0_APM'].apply(p0_apm)
x_train['P1_APM'] = x_train['P1_APM'].apply(p1_apm)

In [None]:
apm = np.arange(len(x_test))

In [None]:
x_test['P0_APM'] = apm

In [None]:
x_test['P1_APM'] = apm

In [None]:
x_test

In [None]:
# time은 데이터프레임에서 가져오지 말고, 시리즈를 변수에 할당시키는 방법이 낫다.
test_time = test.groupby('game_id')['time'].max()
test_time

In [None]:
def p0_apm(i):  
    return x_test.iloc[i, [1, 2, 4, 5, 6, 7, 8]].sum() / test_time.iloc[i]


def p1_apm(i):
    return x_test.iloc[i, [10, 11, 13, 14, 15, 16, 17]].sum() / test_time.iloc[i]

In [None]:
x_test['P0_APM'] = x_test['P0_APM'].apply(p0_apm)
x_test['P1_APM'] = x_test['P1_APM'].apply(p1_apm)

In [None]:
x_test

In [None]:
x_train

In [None]:
x_train['delta_APM'] = x_train['P0_APM'] - x_train['P1_APM']

In [None]:
x_test['delta_APM'] = x_test['P0_APM'] - x_test['P1_APM']

In [None]:
def apm_abstract_train(x):
    if x >= 0:
        return 0
    else:
        return 1
x_train['APM_versus'] = x_train['delta_APM'].apply(apm_abstract_train)
        

In [None]:
def apm_abstract_test(x):
    if x >= 0:
        return 0
    else:
        return 1
x_test['APM_versus'] = x_test['delta_APM'].apply(apm_abstract_test)
        

In [None]:
# APM차이와 winner의 상관관계를 파악
corr_train = pd.DataFrame()
corr_train['winner'] = y_train
corr_train['APM_versus'] = x_train['APM_versus']
corr_train['P0_APM'] = x_train['P0_APM']
corr_train['P1_APM'] = x_train['P1_APM']
corr_train.info()

In [None]:
corr_train.corr()

In [None]:
sns.heatmap(corr_train.corr(), annot=True, cmap='RdYlGn', linewidths=0.2, annot_kws={'size':35})
fig = plt.gcf()

fig.set_size_inches(18, 15)
#plt.xticks(fontsize=14)
plt.show()

In [None]:
x_train.iloc[1, [1,2,4,5,6,7,8]].sum() / train.loc[train['game_id']==1, 'time'].max()

In [None]:
train.loc[train['game_id']==1, 'time'].max()

In [None]:
x_train.iloc[1,[1,2,4,5,6,7,8]].sum() / 9.59

In [None]:
def apm(df):
    if 
    len(df[(df['game_id']==num) & (df['player'] == 0) & (df['event'] != 'Camera')]) / (df.loc[df['game_id'] == 1, 'time'].max())
    len(df[(df['game_id']==num) & (df['player'] == 1) & (df['event'] != 'Camera')]) / (df.loc[df['game_id'] == 1, 'time'].max())
        

In [None]:
# 1번방의 플레이어0의 APM
len(df[(df['game_id']==1) & (df['player'] == 0) & (df['event'] != 'Camera')]) / (df.loc[df['game_id'] == 1, 'time'].max())

In [None]:
# 1번방의 플레이어1의 APM
len(df[(df['game_id']==1) & (df['player'] == 1) & (df['event'] != 'Camera')]) / (df.loc[df['game_id'] == 1, 'time'].max())

In [None]:
df[(df['game_id']==1) & (df['player'] == 1) & (df['event'] != 'Camera')]

# 일꾼 생산

In [None]:
# 플레이어별 카메라 움직임 스캐터로 표현

def plot_camera(df, game_id):
    df = df.loc[df['game_id']==game_id] # 입력한 게임 방
    df = df.loc[df['event']=='Camera'] # 그 방에서 카메라 부분만
    df_0 = df.loc[df['player']==0] # 그 방에서 플레이어 0
    df_1 = df.loc[df['player']==1] # 그 방에서 플레이어 1

    winner = df['winner'].iloc[0] # 그 방에서 위너에서 가장 위에 뜨는 플레이어(승리자)
    game_time = df['time'].values[-1] # 시간에서 가장 마지막 시간(총 게임 시간)
    player_0_species = df_0['species'].iloc[0] # 해당 게임 0플레이어 종족
    player_1_species = df_1['species'].iloc[0] # 해당 게임 1플레이어 종족 

    player_0_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in df_0['event_contents']]).astype(float)
    player_1_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in df_1['event_contents']]).astype(float)
    
    plt.scatter(player_0_camera[:, 0], player_0_camera[:, 1], label='player_0', alpha=0.3, color='b', s=50)
    plt.scatter(player_1_camera[:, 0], player_1_camera[:, 1], label='player_1', alpha=0.3, color='r', s=50)
    plt.legend()
    plt.show()
    
    print('Total game time: %s'%(game_time))
    print('Winner: Player_%i'%(winner))
    print('Player_0: %s'%(player_0_species))
    print('Player_1: %s'%(player_1_species))

## 승리자와 가까운 카메라 위치

In [None]:
df[(df['game_id'] == 1) & (df['event'] == 'Camera')].head(2)

In [None]:
df[(df['game_id'] == 1) & (df['event'] == 'Camera')].tail(10)

# 카메라 시점

In [None]:
# 패턴으로 x축 y축만 리스트로 추출
import re

def get_vector(tokken):
    pattern = '\d*[.]\d*'
    pat = re.compile(pattern)
    result = pat.findall(str(tokken))
    result = [float(i) for i in result]
    return np.array(result, dtype=np.float32)

In [None]:
df_camera['event_contents'] = df_camera['event_contents'].apply(get_vector)
df[df['event'] == 'Camera'] = df_camera

In [None]:
abs(df_one.iloc[-1, 6] - df_one.iloc[0, 6])

In [None]:
# 플레이어별 카메라 움직임 스캐터로 표현

def plot_camera(df, game_id):
    df = df.loc[(df['game_id']==game_id) & (df['event']=='Camera')] # 입력한 게임 방
    df_0 = df.loc[df['player']==0] # 그 방에서 플레이어 0
    df_1 = df.loc[df['player']==1] # 그 방에서 플레이어 1

    winner = df['winner'].iloc[0] # 그 방에서 위너에서 가장 위에 뜨는 플레이어(승리자)
    game_time = df['time'].values[-1] # 시간에서 가장 마지막 시간(총 게임 시간)
    player_0_species = df_0['species'].iloc[0] # 해당 게임 0플레이어 종족
    player_1_species = df_1['species'].iloc[0] # 해당 게임 1플레이어 종족 

    player_0_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in df_0['event_contents']]).astype(float)
    player_1_camera = np.array([item.replace('at (', '').replace(')', '').split(',') for item in df_1['event_contents']]).astype(float)
    
    plt.scatter(player_0_camera[:, 0], player_0_camera[:, 1], label='player_0', alpha=0.3, color='b', s=50)
    plt.scatter(player_1_camera[:, 0], player_1_camera[:, 1], label='player_1', alpha=0.3, color='r', s=50)
    plt.legend()
    plt.show()
    
    print('Total game time: %s'%(game_time))
    print('Winner: Player_%i'%(winner))
    print('Player_0: %s'%(player_0_species))
    print('Player_1: %s'%(player_1_species))

In [None]:
df_0 = df.loc[(df['game_id']==2) & (df['event'] == 'Camera') & (df['player']==0)]
df_0

In [None]:
df_1 = df.loc[(df['game_id']==2) & (df['event'] == 'Camera') & (df['player']==1)]
df_1

In [None]:
df_0.iloc[0, 6] - df_0.iloc[-1,6]

In [None]:
df_1.iloc[0, 6] - df_1.iloc[-1,6]