# Import Module

In [206]:
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import warnings
import os
warnings.filterwarnings('ignore')
print(os.listdir('../dataset'))

['Challenger_Ranked_Games_10minute.csv', 'Challenger_Ranked_Games_15minute.csv', 'preprocessing', 'preprocessPreprocess_test.csv', 'preprocessPreprocess_train.csv']


# Load Data

## Raw

In [207]:
# Select the 10-minute dataset by name. os.listdir() returns entries in
# arbitrary order, so indexing its result with [0] is not guaranteed to pick
# this file on every machine or after new files are added to the directory.
select_data_loc = 'Challenger_Ranked_Games_10minute.csv'

data = pd.read_csv('../dataset/' + select_data_loc)
print("Full Dataset Shape: ", data.shape)

# delete label
# (redWins is dropped here; blueWins is used as the target in the split below)
data.drop(columns = 'redWins', inplace = True)

Full Dataset Shape:  (26409, 51)


### Check Data Shape

In [208]:
# Preview the first rows of the loaded frame (redWins has already been dropped).
data.head()

Unnamed: 0,gameId,blueWins,blueTotalGolds,blueCurrentGolds,blueTotalLevel,blueAvgLevel,blueTotalMinionKills,blueTotalJungleMinionKills,blueFirstBlood,blueKill,...,redFirstTowerLane,redTowerKills,redMidTowerKills,redTopTowerKills,redBotTowerKills,redInhibitor,redFirstDragon,redDragnoType,redDragon,redRiftHeralds
0,4247263043,0,14870,2889,32,6.4,199,53,0,3,...,[],0,0,0,0,0,1,['WATER_DRAGON'],1,0
1,4247155821,1,14497,2617,33,6.6,229,44,0,2,...,[],0,0,0,0,0,0,[],0,0
2,4243963257,0,15617,1757,34,6.8,223,39,0,3,...,['BOT_LANE'],1,0,0,1,0,1,['FIRE_DRAGON'],1,1
3,4241678498,0,15684,1439,35,7.0,251,64,0,3,...,[],0,0,0,0,0,0,[],0,0
4,4241538868,1,17472,3512,35,7.0,257,46,0,7,...,[],0,0,0,0,0,0,[],0,0


# Basic Process

## Train Test Split

Use the seed used in Who_is_win_Exploration Data Analysis_v1. Therefore, training data and test data are the same as before.

In [209]:
# Separate the predictors from the blueWins target before splitting.
features = data.drop(columns = 'blueWins')
target = data['blueWins']

# Fixed seed and test size keep the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size = 0.25,
    random_state = 42,
)

# Re-attach the target as the last column and renumber rows from 0.
train = pd.concat([X_train, y_train], axis = 1).reset_index(drop = True)
test = pd.concat([X_test, y_test], axis = 1).reset_index(drop = True)

print("Train Data Shape : ", train.shape)
print("Test Data Shape : ", test.shape)

Train Data Shape :  (19806, 50)
Test Data Shape :  (6603, 50)


# Main Process

## Create Derived Feature

In the baseline model, the feature with the highest importance can be interpreted as a kd/a (kill, death, assist) feature. Therefore, the kd/a variable is created first.

In [210]:
def DerivedFeature_KDA(data):
    """Return a copy of ``data`` with 'blueKd/a' and 'redKd/a' columns added.

    For each team the ratio is (Kill + Assist) / Death. When that ratio is
    not finite (for example when Death is 0), the value falls back to
    Kill + Assist, i.e. the same numerator divided by 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain '<team>Kill', '<team>Assist' and '<team>Death' columns
        for both the 'blue' and 'red' teams.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    result = data.copy()

    for team in ('blue', 'red'):
        ratio_col = team + 'Kd/a'
        contribution = result[team + 'Kill'] + result[team + 'Assist']

        result[ratio_col] = contribution / result[team + 'Death']
        # Division by zero yields inf (or NaN for 0/0); replace those entries
        # with the undivided kill + assist total.
        result.loc[~np.isfinite(result[ratio_col]), ratio_col] = contribution

    return result

In [211]:
# Add the blueKd/a and redKd/a columns to both splits.
train = DerivedFeature_KDA(train)
test = DerivedFeature_KDA(test)

### Categorical Encode

- blueFirstTowerLane: Blue team's first kill lane for 10 minutes.


- blueDragnoType: Type of Dragon killed by Blue Team for 10 minutes.


- redFirstTowerLane: Red team's first kill lane for 10 minutes.


- redDragnoType: Type of Dragon killed by Red Team for 10 minutes.

#### DragonType

In League of Legends, the dragon's first appearance is five minutes after the game starts. Five minutes after the first dragon was killed, a new dragon appeared, and there are observations of multiple dragons in the data.

In the exploratory data analysis, each of these unusual values accounted for only about 0.3% of the observations. Therefore, a record showing the dragon being killed more than once is treated as an observation error and is removed.

Elder Dragon is a dragon that appears when a team kills four dragons. Because it cannot occur realistically, this observation is also removed.

Finally, these observations are excluded because there is no such situation where both teams have eliminated the dragon.

Reference: [리그 오브 레전드/드래곤 - 나무위키](https://namu.wiki/w/%EB%A6%AC%EA%B7%B8%20%EC%98%A4%EB%B8%8C%20%EB%A0%88%EC%A0%84%EB%93%9C/%EB%93%9C%EB%9E%98%EA%B3%A4#s-2.2)

In [212]:
# Kept for inspection only: every distinct value observed in the training split.
dragon_type = train['blueDragnoType'].unique().tolist()

# Explicit whitelist of accepted values (no dragon, or exactly one elemental dragon).
# Slicing the first five entries of unique() depends on row order, and the former
# "[ELDER_DRAGON]" entry could never match a value formatted like "['...']",
# so listing the accepted strings directly yields the same filter without the fragility.
true_value = ["['AIR_DRAGON']", '[]', "['WATER_DRAGON']", "['FIRE_DRAGON']", "['EARTH_DRAGON']"]
print("True Dragon Type Value: ", true_value, '\n')


def _keep_valid_dragon_rows(frame):
    """Keep rows whose dragon types are whitelisted and where at most one team killed a dragon."""
    whitelisted = frame['blueDragnoType'].isin(true_value) & frame['redDragnoType'].isin(true_value)
    both_teams_killed = (frame['blueDragnoType'] != '[]') & (frame['redDragnoType'] != '[]')
    return frame.loc[whitelisted & ~both_teams_killed].reset_index(drop = True)


train = _keep_valid_dragon_rows(train)
print('Train Data Shape:', train.shape)

test = _keep_valid_dragon_rows(test)
print('test Data Shape:', test.shape)

True Dragon Type Value:  ["['AIR_DRAGON']", '[]', "['WATER_DRAGON']", "['FIRE_DRAGON']", "['EARTH_DRAGON']", '[ELDER_DRAGON]'] 

Train Data Shape: (19693, 52)
test Data Shape: (6571, 52)


It can be seen that the variables associated with Dragon Elimination are FirstDragon and DragonType. Within 10 minutes of the start of the game, there are a total of three situations in Dragon Elimination.

1. The blue team killed the dragon


2. The red team killed the dragon


3. The dragon has not been killed

Generate a feature that can represent these three situations.

<img src="../image/Dragon_Feature.png" width="450">

It can be seen that the FirstDragon feature is a higher concept for DragonType features. In other words, DragonType is the detailed feature of FirstDragon.

Therefore, the FirstDragon feature is not used.

The three cases above may be described using the DragonType feature as follows:

1. [__DRAGON], []
2. [], [__DRAGON]
3. [], []

Merge the two features to create derived features associated with Dragon Kill. <b> It is expected that this will reduce unnecessary dimensions.</b>

In addition, the Dragon and FirstDragon features are redundant because the DragonKill feature can explain all of them. Therefore, they are all deleted.

In [213]:
# Columns that the merged dragonKill feature replaces.
dragon_source_cols = ['blueDragnoType', 'redDragnoType', 'blueDragon', 'blueFirstDragon', 'redDragon', 'redFirstDragon']

# Concatenate the blue and red dragon-type strings into one categorical value,
# then drop the source columns; the merged column ends up last.
train_dragon_kill = train['blueDragnoType'] + train['redDragnoType']
train = train.drop(columns = dragon_source_cols)
train['dragonKill'] = train_dragon_kill

print('Train Dataset Shape:', train.shape)

test_dragon_kill = test['blueDragnoType'] + test['redDragnoType']
test = test.drop(columns = dragon_source_cols)
test['dragonKill'] = test_dragon_kill

print('test Dataset Shape:', test.shape)

Train Dataset Shape: (19693, 47)
test Dataset Shape: (6571, 47)


In [214]:
# Fit the encoder on the training split; the same fitted encoder is reused for the test split.
dragon_encoder = OneHotEncoder()
dragon_cat = dragon_encoder.fit_transform(train['dragonKill'].values.reshape(-1, 1))

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use the new method when it exists and fall back otherwise.
# Both produce 'D_<category>' names for the single input feature 'D'.
_dragon_feature_names = getattr(dragon_encoder, 'get_feature_names_out', None) or dragon_encoder.get_feature_names
dragon_cat_cols = _dragon_feature_names(['D'])
dragon_df = pd.DataFrame(dragon_cat.todense(), columns = dragon_cat_cols)

# Column-wise concat aligns on the index, which was reset to 0..n-1 after filtering.
train = pd.concat([train.drop(columns = 'dragonKill'), dragon_df], axis = 1)

train.head()

Unnamed: 0,gameId,blueTotalGolds,blueCurrentGolds,blueTotalLevel,blueAvgLevel,blueTotalMinionKills,blueTotalJungleMinionKills,blueFirstBlood,blueKill,blueDeath,...,redKd/a,D_['AIR_DRAGON'][],D_['EARTH_DRAGON'][],D_['FIRE_DRAGON'][],D_['WATER_DRAGON'][],D_[]['AIR_DRAGON'],D_[]['EARTH_DRAGON'],D_[]['FIRE_DRAGON'],D_[]['WATER_DRAGON'],D_[][]
0,4204710612,19975,3374,35,7.0,216,38,0,13,6,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4153551749,15892,4012,36,7.2,215,57,0,5,4,...,1.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4139787761,16270,2485,35,7.0,227,56,0,5,6,...,3.6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,4202227127,16605,3290,34,6.8,194,28,0,8,10,...,1.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4189344068,14980,665,33,6.6,199,56,0,4,13,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [215]:
# Apply the encoder fitted on the training split (transform only, no refit).
dragon_cat = dragon_encoder.transform(test['dragonKill'].values.reshape(-1, 1))

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use the new method when it exists and fall back otherwise.
_dragon_feature_names = getattr(dragon_encoder, 'get_feature_names_out', None) or dragon_encoder.get_feature_names
dragon_cat_cols = _dragon_feature_names(['D'])
dragon_df = pd.DataFrame(dragon_cat.todense(), columns = dragon_cat_cols)

# Column-wise concat aligns on the index, which was reset to 0..n-1 after filtering.
test = pd.concat([test.drop(columns = 'dragonKill'), dragon_df], axis = 1)

test.head()

Unnamed: 0,gameId,blueTotalGolds,blueCurrentGolds,blueTotalLevel,blueAvgLevel,blueTotalMinionKills,blueTotalJungleMinionKills,blueFirstBlood,blueKill,blueDeath,...,redKd/a,D_['AIR_DRAGON'][],D_['EARTH_DRAGON'][],D_['FIRE_DRAGON'][],D_['WATER_DRAGON'][],D_[]['AIR_DRAGON'],D_[]['EARTH_DRAGON'],D_[]['FIRE_DRAGON'],D_[]['WATER_DRAGON'],D_[][]
0,4179175415,30612,15812,49,9.8,111,0,0,11,10,...,3.636364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4216106860,16318,1623,34,6.8,242,43,0,6,5,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4181519175,18743,3093,37,7.4,240,53,0,9,5,...,1.111111,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4240705476,17890,3815,36,7.2,241,52,0,9,4,...,1.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4230835112,16334,2789,34,6.8,220,48,0,5,6,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Tower

There are four situations.
1. Only the blue team kill


2. Only the red team kill


3. both sides kill


4. both sides didn't kill

First, the TowerKills feature can be subdivided into MidTowerKills, TopTowerKills and BotTowerKills. Therefore, the TowerKills variable is not considered.

FirstTower is a feature that shows which team got the first kill. There is no observation where FirstTower = 1 for both teams. Therefore, three situations can be explained using two features.

1. Blue Team First Kill


2. Red Team First Kill


3. No First Kill Occurred

<img src="../image/Tower_Feature.png" width="300">


The above two features can be seen as One-Hot Encoded features for the First Kill. However, the FirstTowerLane feature allows the FirstTower feature to be subdivided. <b> Therefore, the FirstTower feature is not used. </b>




In [216]:
# Source columns replaced by the merged FirstKillLane feature.
lane_source_cols = ['blueFirstTowerLane', 'redFirstTowerLane']

# Concatenate the blue and red lane strings into one categorical value,
# then drop the source columns; the merged column ends up last.
train_first_kill_lane = train['blueFirstTowerLane'] + train['redFirstTowerLane']
train = train.drop(columns = lane_source_cols)
train['FirstKillLane'] = train_first_kill_lane
print('Train Data Shape: ', train.shape)

test_first_kill_lane = test['blueFirstTowerLane'] + test['redFirstTowerLane']
test = test.drop(columns = lane_source_cols)
test['FirstKillLane'] = test_first_kill_lane
print('test Data Shape: ', test.shape)

Train Data Shape:  (19693, 54)
test Data Shape:  (6571, 54)


In [217]:
# Inspect the merged blue+red lane strings before encoding them.
train['FirstKillLane'].head()

0    ['BOT_LANE'][]
1              [][]
2              [][]
3    []['TOP_LANE']
4              [][]
Name: FirstKillLane, dtype: object

The sum of MidTowerKills, TopTowerKills, and BotTowerKills features was expected to equal the Kill feature, but there was a difference in value. It can be seen that the kill occurred purely in the line battle. Therefore, these features are used as they are.

In [218]:
# Show blueKill next to the per-lane tower-kill columns for comparison.
train[['blueKill', 'blueMidTowerKills', 'blueTopTowerKills', 'blueBotTowerKills']].head()

Unnamed: 0,blueKill,blueMidTowerKills,blueTopTowerKills,blueBotTowerKills
0,13,0,0,1
1,5,0,0,0
2,5,0,0,0
3,8,0,0,0
4,4,0,0,0


In [219]:
# Fit the encoder on the training split; the same fitted encoder is reused for the test split.
tower_encoder = OneHotEncoder()
tower_cat = tower_encoder.fit_transform(train['FirstKillLane'].values.reshape(-1, 1))

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use the new method when it exists and fall back otherwise.
# Both produce 'T_<category>' names for the single input feature 'T'.
_tower_feature_names = getattr(tower_encoder, 'get_feature_names_out', None) or tower_encoder.get_feature_names
tower_cat_cols = _tower_feature_names(['T'])
tower_df = pd.DataFrame(tower_cat.todense(), columns = tower_cat_cols)

# Column-wise concat aligns on the index, which was reset to 0..n-1 after filtering.
train = pd.concat([train.drop(columns = 'FirstKillLane'), tower_df], axis = 1)
train.head()

Unnamed: 0,gameId,blueTotalGolds,blueCurrentGolds,blueTotalLevel,blueAvgLevel,blueTotalMinionKills,blueTotalJungleMinionKills,blueFirstBlood,blueKill,blueDeath,...,D_[]['FIRE_DRAGON'],D_[]['WATER_DRAGON'],D_[][],T_['BOT_LANE'][],T_['MID_LANE'][],T_['TOP_LANE'][],T_[]['BOT_LANE'],T_[]['MID_LANE'],T_[]['TOP_LANE'],T_[][]
0,4204710612,19975,3374,35,7.0,216,38,0,13,6,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4153551749,15892,4012,36,7.2,215,57,0,5,4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4139787761,16270,2485,35,7.0,227,56,0,5,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4202227127,16605,3290,34,6.8,194,28,0,8,10,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4189344068,14980,665,33,6.6,199,56,0,4,13,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [220]:
# Use transform, not fit_transform: refitting on the test split would re-learn the
# category set from test data, so the encoded columns could differ from the
# training columns whenever a lane combination is absent from (or unique to) test.
tower_cat = tower_encoder.transform(test['FirstKillLane'].values.reshape(-1, 1))

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use the new method when it exists and fall back otherwise.
_tower_feature_names = getattr(tower_encoder, 'get_feature_names_out', None) or tower_encoder.get_feature_names
tower_cat_cols = _tower_feature_names(['T'])
tower_df = pd.DataFrame(tower_cat.todense(), columns = tower_cat_cols)

# Column-wise concat aligns on the index, which was reset to 0..n-1 after filtering.
test = pd.concat([test.drop(columns = 'FirstKillLane'), tower_df], axis = 1)
test.head()

Unnamed: 0,gameId,blueTotalGolds,blueCurrentGolds,blueTotalLevel,blueAvgLevel,blueTotalMinionKills,blueTotalJungleMinionKills,blueFirstBlood,blueKill,blueDeath,...,D_[]['FIRE_DRAGON'],D_[]['WATER_DRAGON'],D_[][],T_['BOT_LANE'][],T_['MID_LANE'][],T_['TOP_LANE'][],T_[]['BOT_LANE'],T_[]['MID_LANE'],T_[]['TOP_LANE'],T_[][]
0,4179175415,30612,15812,49,9.8,111,0,0,11,10,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,4216106860,16318,1623,34,6.8,242,43,0,6,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4181519175,18743,3093,37,7.4,240,53,0,9,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4240705476,17890,3815,36,7.2,241,52,0,9,4,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4230835112,16334,2789,34,6.8,220,48,0,5,6,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# One Step Function

In [221]:
def _filter_valid_dragon_rows(frame, valid_dragon_types):
    """Keep rows with whitelisted dragon types where at most one team killed a dragon."""
    whitelisted = (frame['blueDragnoType'].isin(valid_dragon_types)
                   & frame['redDragnoType'].isin(valid_dragon_types))
    both_teams_killed = (frame['blueDragnoType'] != '[]') & (frame['redDragnoType'] != '[]')
    return frame.loc[whitelisted & ~both_teams_killed].reset_index(drop = True)


def _merge_team_columns(frame, blue_col, red_col, merged_col, extra_drop = ()):
    """Concatenate the blue/red string columns into ``merged_col`` and drop the source columns."""
    merged_values = frame[blue_col] + frame[red_col]
    merged = frame.drop(columns = [blue_col, red_col] + list(extra_drop))
    merged[merged_col] = merged_values
    return merged


def _replace_with_one_hot(frame, column, encoder, prefix, fit):
    """Swap ``column`` for its one-hot columns; the encoder is fitted only when ``fit`` is True."""
    values = frame[column].values.reshape(-1, 1)
    encoded = encoder.fit_transform(values) if fit else encoder.transform(values)
    # OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out; use the new method when it exists and fall back otherwise.
    feature_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    encoded_df = pd.DataFrame(encoded.todense(), columns = feature_names([prefix]))
    # Column-wise concat aligns on the index, which the filter step reset to 0..n-1.
    return pd.concat([frame.drop(columns = column), encoded_df], axis = 1)


def PreprocessData(train_data, test_data, valid_dragon_types = None):
    """Run the full preprocessing pipeline on a train/test pair.

    Steps: add the kd/a features, remove rows with impossible dragon
    observations, merge the blue/red dragon-type and first-tower-lane columns
    into single categorical features, and one-hot encode them. Each encoder is
    fitted on the training split only and then applied to the test split.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Raw splits. They are copied first, so the caller's frames are not modified.
    valid_dragon_types : list of str, optional
        Accepted values of the DragnoType columns. Defaults to "no dragon"
        plus the four elemental dragon strings.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The preprocessed train and test frames. Their shapes are also printed.
    """
    if valid_dragon_types is None:
        # Explicit whitelist instead of relying on a notebook-level global.
        valid_dragon_types = ["['AIR_DRAGON']", '[]', "['WATER_DRAGON']", "['FIRE_DRAGON']", "['EARTH_DRAGON']"]

    train = DerivedFeature_KDA(train_data.copy())
    test = DerivedFeature_KDA(test_data.copy())

    # Remove observations that cannot occur
    train = _filter_valid_dragon_rows(train, valid_dragon_types)
    test = _filter_valid_dragon_rows(test, valid_dragon_types)

    # Create Interactive Categorical Feature
    # (blue column, red column, merged column, one-hot prefix, extra columns to drop)
    feature_specs = [
        ('blueDragnoType', 'redDragnoType', 'dragonKill', 'D',
         ['blueDragon', 'blueFirstDragon', 'redDragon', 'redFirstDragon']),
        ('blueFirstTowerLane', 'redFirstTowerLane', 'FirstKillLane', 'T', []),
    ]

    for blue_col, red_col, merged_col, prefix, extra_drop in feature_specs:
        encoder = OneHotEncoder()

        # The merged column must be built for BOTH splits before encoding.
        train = _merge_team_columns(train, blue_col, red_col, merged_col, extra_drop)
        test = _merge_team_columns(test, blue_col, red_col, merged_col, extra_drop)

        # Fit on train only; the test split reuses the fitted categories so
        # both frames end up with the same encoded columns.
        train = _replace_with_one_hot(train, merged_col, encoder, prefix, fit = True)
        test = _replace_with_one_hot(test, merged_col, encoder, prefix, fit = False)

    print("Preprocessing Train Data Shape: ", train.shape)
    print("Preprocessing Test Data Shape: ", test.shape)
    return train, test
    

# Post Processing
As can be seen in the exploratory data analysis, the XGBoost algorithm raises errors when column names contain special characters such as '[' and ']'. Therefore, we need to convert the column names.

In [224]:
# Pass 1 pattern: an empty list literal "[]"; pass 2 pattern: any leftover bracket.
empty_list_pattern = re.compile(r"\[]", re.IGNORECASE)
bracket_pattern = re.compile(r"\[|\]", re.IGNORECASE)


def _has_bracket(col):
    """Return True when the column label contains '[' or ']'."""
    label = str(col)
    return '[' in label or ']' in label


def _clean_column_name(col):
    """Replace '[]' with '_None_', then strip any remaining square brackets."""
    if _has_bracket(col):
        col = empty_list_pattern.sub("_None_", col)
    if _has_bracket(col):
        col = bracket_pattern.sub("", col)
    return col


train.columns = [_clean_column_name(col) for col in train.columns.values]
test.columns = [_clean_column_name(col) for col in test.columns.values]

# Export Output

In [226]:
EXPORT_PATH = '../dataset/preprocessing'

# to_csv does not create missing parent directories, so make sure the export
# directory exists before writing (no-op when it is already there).
os.makedirs(EXPORT_PATH, exist_ok = True)

# Write both splits without the index column.
train.to_csv(os.path.join(EXPORT_PATH, 'Preprocess_train.csv'), index = False)
test.to_csv(os.path.join(EXPORT_PATH, 'Preprocess_test.csv'), index = False)