In [1]:
import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
from tqdm import tqdm                       # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import warnings                             
warnings.filterwarnings("ignore")           # 경고 문구 미표시

In [2]:
train = pd.read_pickle('data/data/train.pkl')

In [3]:
train

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.00,0,0,0,"at (145.25, 21.5078125)"
1,0,1,0.00,1,0,0,"at (22.75, 147.0078125)"
2,0,1,0.02,0,0,1,['OrbitalCommand [3080001]']
3,0,1,0.02,0,0,2,(1360) - TrainSCV
4,0,1,0.14,0,0,0,"at (142.99609375, 24.50390625)"
...,...,...,...,...,...,...,...
67091771,38871,0,8.51,0,2,0,"at (139.578125, 62.58203125)"
67091772,38871,0,8.52,1,0,5,
67091773,38871,0,8.52,0,2,0,"at (122.42578125, 45.4296875)"
67091774,38871,0,8.52,0,2,0,"at (122.42578125, 43.25390625)"


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67091776 entries, 0 to 67091775
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   game_id         uint16 
 1   winner          uint8  
 2   time            float32
 3   player          uint8  
 4   species         uint8  
 5   event           uint8  
 6   event_contents  object 
dtypes: float32(1), object(1), uint16(1), uint8(4)
memory usage: 1.1+ GB


In [5]:
train['game_id'].unique()

array([    0,     1,     2, ..., 38869, 38870, 38871], dtype=uint16)

In [17]:
def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError

def data_preparation(df, answer=False):
    game_ids = df['game_id'].unique()
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']
    unique_event_0, unique_event_1, delta_event = {}, {}, {}
    for event in events:
        unique_event_0['P0_' + event] = 0
        unique_event_1['P1_' + event] = 0
        delta_event['delta_' + event] = 0
        
    species = df.groupby(['game_id', 'player']).species.unique()
    event_count = df.groupby(['game_id', 'player']).event.value_counts()
    if answer:
        winners = df.groupby(['game_id']).winner.max()
    
    x_data, y_data = [], []
    for game_id in tqdm(game_ids):
        df_event_count = event_count[game_id].unstack(level=-1)
        df = pd.DataFrame(species[game_id])
        df = pd.concat([df, df_event_count], axis=1)   
        df = df.fillna(0)
        
        df_P0_species = pd.DataFrame([species_converter(df.loc[0]['species'][0])], columns=['P0_species'])        
        df_P1_species = pd.DataFrame([species_converter(df.loc[1]['species'][0])], columns=['P1_species'])
        df = df.drop(['species'], axis=1)

        df_P0_event = unique_event_0.copy()
        for column in df.columns:
            df_P0_event['P0_' + column] = df.loc[0][column]
        df_P0_event = pd.DataFrame(pd.Series(df_P0_event)).T

        df_P1_event = unique_event_1.copy()
        for column in df.columns:
            df_P1_event['P1_' + column] = df.loc[1][column]
        df_P1_event = pd.DataFrame(pd.Series(df_P1_event)).T
        
        df_delta_event = delta_event.copy()
        for column in df.columns:
            df_delta_event['delta_' + column] = df_P0_event['P0_' + column][0] - df_P1_event['P1_' + column][0]
        df_delta_event = pd.DataFrame(pd.Series(df_delta_event)).T

        out = pd.concat([df_P0_species, df_P0_event, df_P1_species, df_P1_event, df_delta_event], axis=1)
        out.index = [game_id]
        out.index.name = 'game_id'
        
        x_data.append(out)
        if answer:
            y_data.append(winners[game_id])  

    x_data = pd.concat(x_data)
    y_data = np.array(y_data)
    
    return x_data, y_data

In [18]:
x_train, y_train = data_preparation(train, answer=True)
x_train.head()

  0%|                                                                                        | 0/38872 [00:00<?, ?it/s]


ValueError: 