In [55]:
# more the detail for /results/report.pdf

# import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
%matplotlib inline

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import computational_graph
from chainer import serializers
from chainer.datasets import tuple_dataset
from chainer import iterators, training
from chainer.training import extensions
from pylab import box

In [56]:
# combine something.csv and something_add.csv into somthing_full.csv
for filename in 'train','condition':
  path_f = '../data/raw_data/' + filename + '.csv'
  path_f_add = '../data/raw_data/' + filename + '_add.csv'
  path_f_full = '../data/rearranged_data/' + filename + '_full.csv'
  with open(path_f,'r',encoding='utf-8') as f:
    with open(path_f_full,'w',encoding='utf-8') as f_full:
      str_f = f.read()
      f_full.write(str_f)
  with open(path_f_add,'r',encoding='utf-8') as f_add:
    with open (path_f_full,'a',encoding='utf-8') as f_full:
      strs_f_add = f_add.readlines()
      f_full.writelines(strs_f_add[1:])    

In [103]:
# load data and make pd.dataframe
df_train = pd.read_csv('../data/rearranged_data/train_full.csv')
df_cond = pd.read_csv('../data/rearranged_data/condition_full.csv')
df_test = pd.read_csv('../data/raw_data/test.csv')
df_stad = pd.read_csv('../data/raw_data/stadium.csv')

In [104]:
# check the data
print(df_train.info())
print(df_test.info())
print(df_stad.info())
print(df_cond.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 11 columns):
id         1953 non-null int64
y          1953 non-null int64
year       1953 non-null int64
stage      1953 non-null object
match      1953 non-null object
gameday    1953 non-null object
time       1953 non-null object
home       1953 non-null object
away       1953 non-null object
stadium    1953 non-null object
tv         1953 non-null object
dtypes: int64(3), object(8)
memory usage: 167.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313 entries, 0 to 312
Data columns (total 10 columns):
id         313 non-null int64
year       313 non-null int64
stage      313 non-null object
match      313 non-null object
gameday    313 non-null object
time       313 non-null object
home       313 non-null object
away       313 non-null object
stadium    313 non-null object
tv         313 non-null object
dtypes: int64(2), object(8)
memory usage: 24.5+ KB
None
<class 'pandas.cor

In [105]:
# concatenate df_train with df_test
df = pd.concat([df_train,df_test]).reset_index(drop=True)

# delete rows of df_cond whose id doesn't exist in df_train/df_test's
df_cond = pd.merge(df.loc[:,['id']], df_cond, on='id', how='left')
df_cond.shape

(2266, 31)

In [106]:
# check the min and max of time of df_train/df_test
min_time = min(min(df_train.time),min(df_test.time))
max_time = max(max(df_train.time),max(df_test.time))
print('min =',min_time)
print('max =',max_time)

def convert_time(time):
  min_len = 60*int(min_time[:2]) + int(min_time[-2:])
  max_len = 60*int(max_time[:2]) + int(max_time[-2:])
  return ( 60*int(time[:2]) + int(time[-2:]) - min_len ) / (max_len - min_len)

min = 12:33
max = 20:04


In [107]:
encoded_tv = df['tv'].str.split('／',expand=True).reset_index(drop=True)
encoded_tv[0].unique()

array(['スカパー', 'スカパー！'], dtype=object)

In [108]:
# mapping and generating dummy variables 


# [df]
# stage
df['stage'] = df['stage'].map({'Ｊ１':1, 'Ｊ２':0})

# match
df['match'] = df['match'].map(lambda x: int(x[-2]) - 1)

# gameday
df['gameday'] = df['gameday'].str[6:-1]
encoded_gameday = pd.get_dummies(df['gameday'], drop_first=True)

# time
df['time'] = df['time'].map(convert_time)

# stadium
encoded_stadium = pd.get_dummies(df['stadium'], drop_first=True)

# tv
del encoded_tv[0] # because I knew as above that encoded_tv[0] has no info. 
encoded_tv = pd.get_dummies(encoded_tv)
modified_columns = {col:col[2:] for col in encoded_tv.columns}
encoded_tv.rename(columns=modified_columns, inplace=True)
encoded_tv = encoded_tv.groupby(level=0, axis=1).sum()

# [df_cond]
# weather
encoded_weather = pd.get_dummies(df_cond['weather'], drop_first=True)

# humidity
df_cond['humidity'] = df_cond['humidity'].map(lambda x: 0.01*float(x[:-1]))

# referee
encoded_referee = pd.get_dummies(df_cond['referee'], drop_first=True)

# players
df_players = df_cond.iloc[:,7:]
encoded_players = pd.get_dummies(df_players, drop_first=True)
modified_columns = {col:col[8:] for col in encoded_players.columns}
encoded_players.rename(columns=modified_columns, inplace=True)
encoded_players = encoded_players.groupby(level=0, axis=1).sum()

In [109]:
# PCA with feature values
# [df]
# encoded_gameday
# encoded_stadium
# encoded_tv
# [df_cond]
# encoded_weather
# encoded_referee
# encoded_players
lis_encodeds = [encoded_gameday,encoded_stadium,encoded_tv,encoded_weather,encoded_referee,encoded_players]
encoded_stds = []
pca_encodeds = []
encoded_pcas = []
ss = StandardScaler()
for encoded in lis_encodeds:
  encoded = encoded.values
  encoded_stds.append(ss.fit_transform(encoded))
  pca_encodeds.append(PCA(n_components= 0.8))
  encoded_pcas.append(pca_encodeds[-1].fit_transform(encoded))

for pca in pca_encodeds:
  print(pca.n_components_)



4
31
15
3
26
186


In [110]:
# select the useful non-mapped and non-encoded data
df =  df.drop(['away','gameday','home','stadium','tv','year'],axis=1)
df_cond = df_cond.loc[:,['temperature','humidity']]
df_stad = df_stad.drop('address',axis=1)

In [118]:
# make np.array of y and X_train/X_test
num_train = df_train.shape[0]
y = df[:num_train].loc[:,['y']].values
X = df.loc[:,['match','stage','time']].values
X = np.hstack((X,df_cond.values))
X = ss.fit_transform(X)
for encoded_pca in encoded_pcas:
  X = np.hstack((X, encoded_pca))
X = X.astype(np.float32)
y = y.astype(np.float32).flatten()
X_cal = X[:num_train,]
X_submit = X[num_train:,]

X_train, X_test, y_train, y_test \
= train_test_split(X_cal, y, test_size=0.3, random_state=0)

y_train.shape

(1367,)

In [119]:
# All the above is preprocessing of the data
# Now move on data analysis

In [120]:
class MLP(chainer.Chain):
  def __init__(self):
    super(MLP, self).__init__()
    with self.init_scope():
      self.l1 = L.Linear(270, 350)
      self.l2 = L.Linear(350, 200)
      self.l3 = L.Linear(200, 50)
      self.l4 = L.Linear(50, 1)

  def __call__(self, X):
    h1 = F.relu(self.l1(X))
    h2 = F.relu(self.l2(h1))
    h3 = F.relu(self.l3(h2))
    return self.l4(h3)

# model = L.Classifier(MLP(), lossfun = F.mean_squared_error)
model = MLP()
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
batch_size = 100
n_epoch = 20
train = tuple_dataset.TupleDataset(X_train, y_train)
train_iter = iterators.SerialIterator(train, batch_size=batch_size, shuffle=True)
updater = training.StandardUpdater(train_iter, optimizer, loss_func=F.mean_squared_error,device=-1)
trainer = training.Trainer(updater, (n_epoch,'epoch'), out='result')
test = tuple_dataset.TupleDataset(X_test, y_test)
test_iter = iterators.SerialIterator(test, batch_size=batch_size, shuffle=False, repeat=False)
trainer.extend(extensions.Evaluator(test_iter, model))
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(['epoch','main/loss','main/accuracy','validation/main/loss','validation/main/accuracy']))
trainer.run()

Exception in main training loop: 
Invalid operation is performed in: MeanSquaredError (Forward)

Expect: in_types[0].shape == in_types[1].shape
Actual: (100, 270) != (100,)
Traceback (most recent call last):
  File "C:\Users\monad\Anaconda3\lib\site-packages\chainer\training\trainer.py", line 299, in run
    entry.extension(self)
  File "C:\Users\monad\Anaconda3\lib\site-packages\chainer\training\updater.py", line 223, in update
    converter: Converter function to build input arrays. Each batch
  File "C:\Users\monad\Anaconda3\lib\site-packages\chainer\training\updater.py", line 234, in update_core
    default.
  File "C:\Users\monad\Anaconda3\lib\site-packages\chainer\optimizer.py", line 541, in update
    param.update()
  File "C:\Users\monad\Anaconda3\lib\site-packages\chainer\functions\loss\mean_squared_error.py", line 61, in mean_squared_error
  File "C:\Users\monad\Anaconda3\lib\site-packages\chainer\function_node.py", line 230, in apply
  File "C:\Users\monad\Anaconda3\lib\site

InvalidType: 
Invalid operation is performed in: MeanSquaredError (Forward)

Expect: in_types[0].shape == in_types[1].shape
Actual: (100, 270) != (100,)

In [None]:
def show_graph(src):
  img =  plt.imread(src)
  xpixels,ypixels = img.shape[0],img.shape[1]
  dpi = 100
  margin = 0.01
  figsize = (1 + margin) * ypixels / dpi, (1 + margin) * xpixels / dpi

  fig = plt.figure(figsize=figsize, dpi=dpi)
  ax = fig.add_axes([margin, margin, 1 - 2*margin, 1 - 2*margin])
  ax.tick_params(labelbottom="off",bottom="off")
  ax.tick_params(labelleft="off",left="off")

  ax.imshow(img, interpolation='none')
  box("off")
  plt.show()
  
show_graph('result/loss.png')
show_graph('result/accuracy.png')

In [None]:
y_pred = predict(model, X_submit)

In [466]:
arr_submit = np.hstack((df_test.loc[:,['id']].values, np.vstack(y_pred)))
df_submit = pd.DataFrame(arr_submit,columns=['id','y_pred'])
df_submit['id'] = df_submit['id'].astype('int16')
df_submit

Unnamed: 0,id,y_pred
0,15822,12418.885777
1,15823,14065.541525
2,15824,34958.781943
3,15825,13336.455359
4,15827,28797.079526
5,15828,18284.226516
6,15829,6688.491489
7,15830,15319.925497
8,15831,12669.520310
9,15832,24851.242096


In [428]:
# save the predicted data as csvfile
df_submit.to_csv('../results/y_pred_dl.csv',index=False,header=False,encoding='utf-8')