In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test sets from the CSV files in the working directory
trainData, testData = (pd.read_csv(csvPath) for csvPath in ('train.csv', 'test.csv'))

In [3]:
# Preview the first rows of the training frame
trainData.head()

Unnamed: 0,Won,Cluster ID,Game Mode,Game Type,Hero 1,Hero 2,Hero 3,Hero 4,Hero 5,Hero 6,...,Hero 105,Hero 106,Hero 107,Hero 108,Hero 109,Hero 110,Hero 111,Hero 112,Hero 113,Game ID
0,0,223,2.0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,152,2.0,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,1
2,1,131,2.0,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,2
3,1,154,2.0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,171,2.0,3,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,4


### PRE-PROCESSING

In [4]:
# List every column pandas parsed as 'object' dtype.
# Such columns may hide disguised missing values and are inspected further below.
objectCols = [name for name in trainData.columns if trainData[name].dtype == 'object']
for name in objectCols:
    print(name)

Game Type
Hero 7


In [5]:
# List every column that contains at least one null/NaN entry.
# These columns are handled individually further below.
nanCols = [name for name in trainData.columns if trainData[name].isna().any()]
for name in nanCols:
    print(name)

Game Mode


##### Target 'Won'

In [6]:
# Exploring the target variable: count how many rows carry each distinct 'Won' value
trainData['Won'].value_counts()

 1    48718
 0    43805
-1      127
Name: Won, dtype: int64

In [7]:
# Given problem says 'Won' should be either 1 or 0, hence -1 is a missing value.
# Gameplays without a meaningful target value cannot contribute to training, so these rows are removed.
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments in the following cells reliably modify trainData itself instead of
# triggering pandas' SettingWithCopyWarning.
trainData = trainData[trainData['Won'] != -1].copy()

In [8]:
# Re-check the target distribution now that the -1 rows have been filtered out
trainData['Won'].value_counts()

1    48718
0    43805
Name: Won, dtype: int64

##### 'Game Mode'

In [9]:
# Checking distribution of 'Game Mode' (value_counts leaves NaN entries out by default)
trainData['Game Mode'].value_counts()

2.0    71591
9.0    10017
8.0     8294
1.0     1229
6.0      497
3.0      386
4.0      206
7.0       81
5.0       12
Name: Game Mode, dtype: int64

In [10]:
# True means 'Game Mode' still contains at least one NaN that has to be imputed
trainData['Game Mode'].isna().any()

True

In [11]:
# As 'Game Mode' is a categorical variable, missing values must be replaced with one of the categories 1-9.
# The mode of the observed data (=2) has a very high frequency, so it is used as the fill value.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on the selected
# column operates on an intermediate object and is not guaranteed to update trainData.
gameModeFill = trainData['Game Mode'].mode()[0]
trainData['Game Mode'] = trainData['Game Mode'].fillna(gameModeFill)

In [12]:
# Re-check the 'Game Mode' distribution after filling the missing values
trainData['Game Mode'].value_counts()

2.0    71801
9.0    10017
8.0     8294
1.0     1229
6.0      497
3.0      386
4.0      206
7.0       81
5.0       12
Name: Game Mode, dtype: int64

In [13]:
# Inspect the current dtype of 'Game Mode' before converting it
trainData['Game Mode'].dtype

dtype('float64')

In [14]:
# Store 'Game Mode' as 64-bit integers for convenience (no NaN is left after the fill above)
trainData = trainData.astype({'Game Mode': 'int64'})

##### 'Game Type'

In [15]:
# Checking for disguised missing values: any entry that is not a category code is a placeholder
trainData['Game Type'].value_counts()

2    56418
3    35604
?      486
1       15
Name: Game Type, dtype: int64

In [16]:
# As 'Game Type' is a categorical variable, the '?' placeholders must be replaced with one of the categories 1-3.
# The mode is computed over the known values only, so the placeholder itself can never be
# picked as the replacement; with the data above the mode (=2) has a very high frequency.
# The result is assigned back explicitly: replace(inplace=True) on the selected column
# operates on an intermediate object and is not guaranteed to update trainData.
knownGameTypes = trainData.loc[trainData['Game Type'] != '?', 'Game Type']
trainData['Game Type'] = trainData['Game Type'].replace('?', knownGameTypes.mode()[0])

In [17]:
# Inspect the current dtype of 'Game Type' before converting it
trainData['Game Type'].dtype

dtype('O')

In [18]:
# Convert 'Game Type' from object dtype to 64-bit integers
trainData = trainData.astype({'Game Type': 'int64'})

##### 'Hero 7'

In [19]:
# Check the values present in 'Hero 7', including any placeholder for missing entries
trainData['Hero 7'].value_counts()

0     80619
-1     5606
1      5577
?       721
Name: Hero 7, dtype: int64

In [20]:
# Only 'Hero 7' of all the Hero features has some missing values ('?').
# It is given in the question that for each row, there should be 5 1's and 5 -1's from Hero 1 to Hero 113.
# Using this domain knowledge (instead of mode/median/bfill/ffill) recovers the original value:
#   fewer than five 1's among the other heroes  -> 'Hero 7' must be 1
#   fewer than five -1's among the other heroes -> 'Hero 7' must be -1
#   otherwise                                   -> 'Hero 7' must be 0
# The counts are computed for all affected rows at once instead of looping with iterrows().

# 'Hero 7' is left out of the counts: its '?' placeholder matches neither 1 nor -1
otherHeroCols = ['Hero ' + str(i) for i in range(1, 114) if i != 7]
missingMask = trainData['Hero 7'] == '?'

if missingMask.any():
    otherHeroes = trainData.loc[missingMask, otherHeroCols]
    cntY = (otherHeroes == 1).sum(axis=1)
    cntN = (otherHeroes == -1).sum(axis=1)

    # np.select takes the first matching condition, mirroring an if / elif / else chain
    trainData.loc[missingMask, 'Hero 7'] = np.select([cntY < 5, cntN < 5], [1, -1], default=0)

In [21]:
# Re-check 'Hero 7' after the fill; the column is still object dtype at this point,
# so filled entries and original entries may be listed as separate groups
trainData['Hero 7'].value_counts()

0     80619
-1     5606
1      5577
0       644
-1       39
1        38
Name: Hero 7, dtype: int64

In [22]:
# Convert 'Hero 7' to 64-bit integers now that no placeholder is left in the column
trainData = trainData.astype({'Hero 7': 'int64'})

In [23]:
# Confirm that 'Hero 7' now holds only the integer values -1, 0 and 1
trainData['Hero 7'].value_counts()

 0    81263
-1     5645
 1     5615
Name: Hero 7, dtype: int64

##### Scaling 'Cluster ID'

In [24]:
# Frequency of each 'Cluster ID' value, to see whether the column is a per-game identifier
trainData['Cluster ID'].value_counts()

227    7865
154    7003
156    6889
151    6838
153    6744
152    6706
155    6606
224    5948
231    3662
223    3496
225    2639
232    1703
204    1463
182    1376
187    1256
145    1239
144    1201
188    1190
181    1183
171    1118
186    1086
184     973
185     968
183     941
191     921
111     837
192     827
112     804
133     762
131     756
132     749
121     714
138     694
124     664
123     663
122     660
134     645
137     615
136     612
135     604
261     261
161     244
251     154
241     126
213      89
211      18
212      11
Name: Cluster ID, dtype: int64

In [25]:
# Summary statistics of 'Cluster ID', used to judge its value range
trainData['Cluster ID'].describe()

count    92523.000000
mean       175.858900
std         35.657784
min        111.000000
25%        152.000000
50%        156.000000
75%        223.000000
max        261.000000
Name: Cluster ID, dtype: float64

In [26]:
# From above, 'Cluster ID' is not unique to each gameplay, hence should not be discarded on the misconception that
# it is an ID for each gameplay.
# Also, 'Cluster ID' ranges from 111 to 261, hence needs to be scaled as other features range from -1 to 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data only and reuse the learned min/max for the test data.
# Re-fitting on the test set would map the same 'Cluster ID' to different scaled values in
# train and test whenever their ranges differ.
trainData['Cluster ID'] = scaler.fit_transform(np.reshape(trainData['Cluster ID'].values, (-1, 1)))
testData['Cluster ID'] = scaler.transform(np.reshape(testData['Cluster ID'].values, (-1, 1)))



##### OHE categorical variables

In [27]:
# Both 'Game Type' and 'Game Mode' are categorical variables, need to be one hot encoded
categoricalCols = ['Game Type', 'Game Mode']
trainData = pd.get_dummies(trainData, columns=categoricalCols, drop_first=True)

# Encoding the test set independently with drop_first=True is fragile: if a category is absent
# from the test data, a different category gets dropped and the columns no longer line up with
# the training columns. The test set is therefore encoded in full and then aligned to the
# training columns: extra dummy columns are discarded and missing ones are added as all-zero.
# 'Won' is the target and is not carried over to the test frame.
testData = pd.get_dummies(testData, columns=categoricalCols)
testCols = [col for col in trainData.columns if col != 'Won']
testData = testData.reindex(columns=testCols, fill_value=0)

##### Removing redundant columns

In [28]:
# Collect the columns that hold one single value throughout the training data;
# they carry no information for training and are dropped in the next step.
dropCols = [name for name in trainData.columns if len(trainData[name].unique()) == 1]
for name in dropCols:
    print(name)

Hero 24
Hero 108


In [29]:
# Remove the constant columns from both frames so train and test keep the same features
trainData = trainData.drop(columns=dropCols)
testData = testData.drop(columns=dropCols)

##### Splitting features and targets

In [30]:
# 'Won' is the target; 'Game ID' only identifies a gameplay, so both are left out of the features
y_train = trainData['Won']
x_train = trainData.drop(columns=['Won', 'Game ID'])

In [31]:
# Preview the feature matrix that is fed to the model
x_train.head()

Unnamed: 0,Cluster ID,Hero 1,Hero 2,Hero 3,Hero 4,Hero 5,Hero 6,Hero 7,Hero 8,Hero 9,...,Game Type_2,Game Type_3,Game Mode_2,Game Mode_3,Game Mode_4,Game Mode_5,Game Mode_6,Game Mode_7,Game Mode_8,Game Mode_9
0,0.746667,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,0.273333,0,0,0,1,0,-1,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,0.133333,0,0,0,1,0,-1,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,0.286667,0,0,0,0,0,0,-1,0,0,...,1,0,1,0,0,0,0,0,0,0
4,0.4,0,0,0,0,0,-1,0,0,-1,...,0,1,1,0,0,0,0,0,0,0


In [32]:
# Preview the target values
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Won, dtype: int64

In [33]:
# Preview the processed test frame to compare its columns with x_train
testData.head()

Unnamed: 0,Cluster ID,Hero 1,Hero 2,Hero 3,Hero 4,Hero 5,Hero 6,Hero 7,Hero 8,Hero 9,...,Game Type_2,Game Type_3,Game Mode_2,Game Mode_3,Game Mode_4,Game Mode_5,Game Mode_6,Game Mode_7,Game Mode_8,Game Mode_9
0,0.746667,0,-1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,0.773333,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
2,0.166667,1,0,0,0,-1,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0
3,0.773333,-1,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0
4,0.486667,0,0,0,-1,0,0,0,-1,0,...,0,1,1,0,0,0,0,0,0,0


### TRAINING MODEL

In [34]:
from keras.layers import Dense, BatchNormalization, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [35]:
# A deliberately shallow network: a single hidden layer with 4 neurons.
# Intuition - 1 neuron each for effect of Hero Types, Cluster ID, Game Type, Game Mode

# The architecture can be tweaked by adding layers or other regularization techniques

model = Sequential([
    Dense(4, activation='relu'),
    BatchNormalization(),
    Dense(1, activation='sigmoid'),
])

In [36]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported as a metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [37]:
# Early stopping ends training once validation accuracy has not improved for 5 epochs.
# ModelCheckpoint keeps only the best model (judged by validation loss) in 'best_model.h5'.
# NOTE(review): the 'val_acc' key depends on the installed Keras version - confirm it is logged.
stopEarly = EarlyStopping(monitor='val_acc', patience=5)
saveBest = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)

model.fit(x_train.values, y_train.values, epochs=20, validation_split=0.2,
          callbacks=[stopEarly, saveBest])

Train on 74018 samples, validate on 18505 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


<keras.callbacks.History at 0x228940814e0>

In [38]:
# Loading weights of best model which was saved to 'best_model.h5' by ModelCheckpoint during training
model.load_weights(filepath='best_model.h5')

In [39]:
# predict_classes gives class labels directly; 'Game ID' is an identifier, not a feature
testFeatures = testData.drop(columns=['Game ID'])
y_pred = model.predict_classes(testFeatures)

In [40]:
# Distribution of output: the distinct predicted classes and how often each one occurs
np.unique(y_pred, return_counts=True)

(array([0, 1]), array([4212, 6082], dtype=int64))

In [41]:
# Submission frame: the 'Game ID' of every test row next to its predicted 'Won' class
subData = testData['Game ID'].to_frame()
subData['Won'] = y_pred

In [42]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV
subData.to_csv('Submission.csv', index=False)

In [43]:
# Read the written file back to check its layout
pd.read_csv('Submission.csv').head()

Unnamed: 0,Game ID,Won
0,0,0
1,1,1
2,2,1
3,3,1
4,4,0


The above method gives an accuracy of around 0.5999. Hence, even a very shallow network can give good results when it is trained on well pre-processed data.