# Vol 0.1.0 LSTM model

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

## 1.Read the data

In [2]:
match_data = pd.read_csv('../ref/Top_1000_teams_matches.csv', index_col=0)
heroes_id = pd.read_csv('../ref/heroes_id.csv', index_col=0, header=None)
teams_id = pd.read_csv('../ref/teams_id.csv', index_col=0, header=None)
# Shuffle the match data
match_data = match_data.sample(frac=1, random_state=101).reset_index(drop=True)
y = match_data.iloc[:, -4:-1]
X = match_data.drop(['Radiant_win', 'Radiant_fb', 'Radiant_10kill'], axis=1)
print('total # of matches: {}'.format(match_data.shape[0]))

total # of matches: 17787


In [3]:
match_data.head()

Unnamed: 0,Radiant_team_id,Radiant_team_rating,Dire_team_id,Dire_team_rating,Radiant_hero1,Radiant_hero2,Radiant_hero3,Radiant_hero4,Radiant_hero5,Dire_hero1,Dire_hero2,Dire_hero3,Dire_hero4,Dire_hero5,Radiant_win,Radiant_fb,Radiant_10kill,start_time
0,3331948,1203.45,3325252,1000.0,97,8,30,62,65,86,2,110,18,74,True,True,True,1489206014
1,111474,1084.27,2006913,1291.81,69,31,39,26,73,91,19,3,28,78,False,True,True,1444935417
2,5326717,1035.27,4425117,1073.52,57,86,3,8,11,19,78,87,39,88,False,True,False,1521978605
3,2202484,1000.0,3332295,1215.38,57,48,3,51,47,62,86,108,70,35,False,True,False,1481731068
4,2780911,1181.1,3672381,1000.0,83,104,112,1,47,16,5,70,55,74,True,True,True,1494158848


In [4]:
y.head()

Unnamed: 0,Radiant_win,Radiant_fb,Radiant_10kill
0,True,True,True
1,False,True,True
2,False,True,False
3,False,True,False
4,True,True,True


In [5]:
X.head()

Unnamed: 0,Radiant_team_id,Radiant_team_rating,Dire_team_id,Dire_team_rating,Radiant_hero1,Radiant_hero2,Radiant_hero3,Radiant_hero4,Radiant_hero5,Dire_hero1,Dire_hero2,Dire_hero3,Dire_hero4,Dire_hero5,start_time
0,3331948,1203.45,3325252,1000.0,97,8,30,62,65,86,2,110,18,74,1489206014
1,111474,1084.27,2006913,1291.81,69,31,39,26,73,91,19,3,28,78,1444935417
2,5326717,1035.27,4425117,1073.52,57,86,3,8,11,19,78,87,39,88,1521978605
3,2202484,1000.0,3332295,1215.38,57,48,3,51,47,62,86,108,70,35,1481731068
4,2780911,1181.1,3672381,1000.0,83,104,112,1,47,16,5,70,55,74,1494158848


## 2.One-hot encoding team_id and hero_id

In [6]:
def build_oh_encoder(X):
    label_encoder = LabelEncoder().fit(X.ravel())
    interger_encoded = label_encoder.transform(X.ravel()).reshape(-1,1)
    oh_encoder = OneHotEncoder().fit(interger_encoded)
    return label_encoder, oh_encoder

def oh_encode(X, oh_encoder, label_encoder):
    interger_encoded = label_encoder.transform(X.ravel()).reshape(-1,1)
    oh_encoded = oh_encoder.transform(interger_encoded)
    return oh_encoded

def oh_decode(X_encoded, label_encoder):
    return label_encoder.inverse_transform([np.argmax(X_encoded.todense()[0, :])])

In [7]:
label_encoder_team, oh_encoder_team = build_oh_encoder(teams_id.values)
label_encoder_hero, oh_encoder_hero = build_oh_encoder(heroes_id.values)

In [8]:
X['Radiant_team_id'] = oh_encode(X['Radiant_team_id'].values.reshape(-1,1), oh_encoder_team, label_encoder_team)
X['Dire_team_id'] = oh_encode(X['Dire_team_id'].values.reshape(-1,1), oh_encoder_team, label_encoder_team)
for hero_num in range(1,6):
    X['Radiant_hero'+str(hero_num)] = oh_encode(X['Radiant_hero'+str(hero_num)].values.reshape(-1, 1), 
                                                oh_encoder_hero, label_encoder_hero) 
    X['Dire_hero'+str(hero_num)] = oh_encode(X['Dire_hero'+str(hero_num)].values.reshape(-1, 1), 
                                                oh_encoder_hero, label_encoder_hero)
print('Finish One-hot encoding features.')

[0] Finish One-hot encoding features.


## 3.Split data into training and testing dataset

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=102)

## 4.Select hero feature and non-hero feature

In [21]:
def get_hero_features(X):
    X_hero = X.loc[:, ['Radiant_hero1', 'Radiant_hero2', 'Radiant_hero3', 
                       'Radiant_hero4', 'Radiant_hero5', 'Dire_hero1', 
                       'Dire_hero2', 'Dire_hero3', 'Dire_hero4', 'Dire_hero5']]
    return X_hero

def get_non_hero_features(X):
    X_non_hero = X.loc[:, ['Radiant_team_id', 'Radiant_team_rating', 'Dire_team_id', 
                           'Dire_team_rating', 'start_time']]
    return X_non_hero

In [22]:
X_train_hero = get_hero_features(X_train)
X_train_non_hero = get_non_hero_features(X_train)
X_test_hero = get_hero_features(X_test)
X_test_non_hero = get_non_hero_features(X_test)
print('X_train_hero.shape:     ({}, {})'.format(X_train_hero.shape[0], X_train_hero.shape[1]))
print('X_train_non_hero.shape: ({}, {})'.format(X_train_non_hero.shape[0], X_train_non_hero.shape[1]))
print('X_test_hero.shape:      ({}, {})'.format(X_test_hero.shape[0], X_test_hero.shape[1]))
print('X_test_non_hero.shape:  ({}, {})'.format(X_test_non_hero.shape[0], X_test_non_hero.shape[1]))
print('y_train.shape:          ({}, {})'.format(y_train.shape[0], y_train.shape[1]))
print('y_test.shape:           ({}, {})'.format(y_test.shape[0], y_test.shape[1]))

X_train_hero.shape:     (14229, 10)
X_train_non_hero.shape: (14229, 5)
X_test_hero.shape:      (3558, 10)
X_test_non_hero.shape:  (3558, 5)
y_train.shape:          (14229, 3)
y_test.shape:           (3558, 3)


In [23]:
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
heroes_input = Input(shape=(10,), name='heroes_input')


In [25]:
X_train_hero.iloc[0,0]

<17787x115 sparse matrix of type '<class 'numpy.float64'>'
	with 17787 stored elements in Compressed Sparse Row format>