In [1]:
# for more detail, see /results/report.pdf

# import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
%matplotlib inline

In [2]:
# Combine <name>.csv and <name>_add.csv into <name>_full.csv.
# The first line (header row) of <name>_add.csv is skipped so the combined file
# contains a single header.
for filename in 'train', 'condition':
  path_f = '../data/raw_data/' + filename + '.csv'
  path_f_add = '../data/raw_data/' + filename + '_add.csv'
  path_f_full = '../data/rearranged_data/' + filename + '_full.csv'
  # Read both inputs first so the output file is only created once both exist.
  with open(path_f, 'r', encoding='utf-8') as f:
    str_f = f.read()
  with open(path_f_add, 'r', encoding='utf-8') as f_add:
    strs_f_add = f_add.readlines()
  with open(path_f_full, 'w', encoding='utf-8') as f_full:
    f_full.write(str_f)
    # If the base file does not end with a newline, the first appended row
    # would be glued onto the last base row; terminate the line explicitly.
    if str_f and not str_f.endswith('\n'):
      f_full.write('\n')
    f_full.writelines(strs_f_add[1:])

In [3]:
# Read the four input tables into DataFrames: the combined train/condition
# files from rearranged_data and the test/stadium files from raw_data.
rearranged_dir = '../data/rearranged_data/'
raw_dir = '../data/raw_data/'

df_train = pd.read_csv(rearranged_dir + 'train_full.csv')
df_cond = pd.read_csv(rearranged_dir + 'condition_full.csv')
df_test = pd.read_csv(raw_dir + 'test.csv')
df_stad = pd.read_csv(raw_dir + 'stadium.csv')

In [4]:
# Summarise each table (index range, columns, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() adds a stray "None" line after
# every summary.
for frame in (df_train, df_test, df_stad, df_cond):
  frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 11 columns):
id         1953 non-null int64
y          1953 non-null int64
year       1953 non-null int64
stage      1953 non-null object
match      1953 non-null object
gameday    1953 non-null object
time       1953 non-null object
home       1953 non-null object
away       1953 non-null object
stadium    1953 non-null object
tv         1953 non-null object
dtypes: int64(3), object(8)
memory usage: 167.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313 entries, 0 to 312
Data columns (total 10 columns):
id         313 non-null int64
year       313 non-null int64
stage      313 non-null object
match      313 non-null object
gameday    313 non-null object
time       313 non-null object
home       313 non-null object
away       313 non-null object
stadium    313 non-null object
tv         313 non-null object
dtypes: int64(2), object(8)
memory usage: 24.5+ KB
None
<class 'pandas.cor

In [5]:
# Stack the train rows and the test rows into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)

# Left-join the condition table onto the ids of df: every id of df is kept and
# condition rows whose id is in neither df_train nor df_test are discarded.
ids_only = df[['id']]
df_cond = ids_only.merge(df_cond, on='id', how='left')
df_cond.shape

(2266, 31)

In [6]:
# Find the smallest and largest kick-off time over train and test together.
# The values are compared as strings.
# NOTE(review): string ordering equals chronological ordering only for
# zero-padded 'HH:MM' values - confirm against the data.
all_times = list(df_train.time) + list(df_test.time)
min_time = min(all_times)
max_time = max(all_times)
print(f'min = {min_time}')
print(f'max = {max_time}')

def convert_time(time, lo=None, hi=None):
  """Scale a time of day linearly so that `lo` maps to 0.0 and `hi` to 1.0.

  Parameters
  ----------
  time : str
      Time of day written as 'H:MM' or 'HH:MM'.
  lo, hi : str, optional
      Lower and upper bound in the same format. When omitted, the
      notebook-level `min_time` and `max_time` are used.

  Returns
  -------
  float
      (time - lo) / (hi - lo), with all three values measured in minutes.

  Raises
  ------
  ValueError
      If a value cannot be parsed as '<hours>:<minutes>'.
  ZeroDivisionError
      If `lo` and `hi` denote the same time.
  """
  def to_minutes(hhmm):
    # Split on ':' instead of slicing fixed positions so that hours without a
    # leading zero (e.g. '9:05') are parsed correctly.
    hours, minutes = hhmm.split(':')[:2]
    return 60 * int(hours) + int(minutes)

  # The globals are looked up only when a bound is not passed explicitly.
  lo = min_time if lo is None else lo
  hi = max_time if hi is None else hi
  start = to_minutes(lo)
  return (to_minutes(time) - start) / (to_minutes(hi) - start)

min = 12:33
max = 20:04


In [7]:
# Split the '／'-separated tv string of every row into one column per position,
# then list the distinct values found in the first position.
tv_parts = df['tv'].str.split('／', expand=True)
encoded_tv = tv_parts.reset_index(drop=True)
encoded_tv[0].unique()

array(['スカパー', 'スカパー！'], dtype=object)

In [8]:
# Map ordinal/binary columns to numbers and one-hot encode the categorical ones.
# NOTE(review): this cell overwrites df / df_cond columns and encoded_tv in
# place, so it is meant to be run exactly once after the cells above; running
# it a second time operates on already-converted values.


# [df]
# stage: binary flag, 'Ｊ１' -> 1 and 'Ｊ２' -> 0
df['stage'] = df['stage'].map({'Ｊ１':1, 'Ｊ２':0})

# match: take the second-to-last character of the string as a digit, minus 1
# (presumably the day number within the round, made zero-based - TODO confirm)
df['match'] = df['match'].map(lambda x: int(x[-2]) - 1)

# gameday: keep the characters from position 6 up to (not including) the last
# one, then one-hot encode that substring, dropping the first level.
# (presumably the part inside the trailing parentheses - TODO confirm format)
df['gameday'] = df['gameday'].str[6:-1]
encoded_gameday = pd.get_dummies(df['gameday'], drop_first=True)

# time: rescale the kick-off time with convert_time
df['time'] = df['time'].map(convert_time)

# stadium: one-hot encode, dropping the first level
encoded_stadium = pd.get_dummies(df['stadium'], drop_first=True)

# tv: one indicator column per broadcaster, regardless of its position in the
# original '／'-separated string.
del encoded_tv[0] # dropped as uninformative, based on the unique() check in the previous cell
# get_dummies names each dummy '<source column>_<value>'.
encoded_tv = pd.get_dummies(encoded_tv)
# Strip the first two characters of every dummy name so the same broadcaster
# gets the same name whichever position it came from.
# NOTE(review): this matches the '<column>_' prefix only while the source column
# label is a single character (positions 1-9) - verify.
modified_columns = {col:col[2:] for col in encoded_tv.columns}
encoded_tv.rename(columns=modified_columns, inplace=True)
# Sum the columns that now share a name into a single column per broadcaster.
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas releases.
encoded_tv = encoded_tv.groupby(level=0, axis=1).sum()

# [df_cond]
# weather: one-hot encode, dropping the first level
encoded_weather = pd.get_dummies(df_cond['weather'], drop_first=True)

# humidity: drop the last character (presumably a percent sign - TODO confirm)
# and multiply by 0.01 to get a fraction
df_cond['humidity'] = df_cond['humidity'].map(lambda x: 0.01*float(x[:-1]))

# referee: one-hot encode, dropping the first level
encoded_referee = pd.get_dummies(df_cond['referee'], drop_first=True)

# players: every column of df_cond from position 7 onwards
# (presumably the line-up columns - TODO confirm the column layout)
df_players = df_cond.iloc[:,7:]
encoded_players = pd.get_dummies(df_players, drop_first=True)
# Strip the first eight characters of every dummy name, then sum the columns
# that share a name so each remaining name has a single column.
# NOTE(review): assumes the '<column>_' prefix is exactly 8 characters long for
# every player column - verify.
modified_columns = {col:col[8:] for col in encoded_players.columns}
encoded_players.rename(columns=modified_columns, inplace=True)
encoded_players = encoded_players.groupby(level=0, axis=1).sum()

In [9]:
# Reduce every one-hot encoded feature group with its own PCA, keeping as many
# components as needed to explain 80% of the variance.
#   from df:      encoded_gameday, encoded_stadium, encoded_tv
#   from df_cond: encoded_weather, encoded_referee, encoded_players
lis_encodeds = [
  encoded_gameday, encoded_stadium, encoded_tv,
  encoded_weather, encoded_referee, encoded_players,
]
ss = StandardScaler()
encoded_stds, pca_encodeds, encoded_pcas = [], [], []
for group in lis_encodeds:
  raw = group.values
  # NOTE(review): the standardized copy is stored in encoded_stds but is never
  # passed to the PCA below - confirm whether the PCA was meant to use it.
  encoded_stds.append(ss.fit_transform(raw))
  reducer = PCA(n_components=0.8)
  pca_encodeds.append(reducer)
  # The PCA is fitted on the raw (unscaled) indicator values.
  encoded_pcas.append(reducer.fit_transform(raw))

# Show how many components each PCA kept.
for reducer in pca_encodeds:
  print(reducer.n_components_)



4
31
15
3
26
186


In [10]:
# Drop the columns that are no longer needed as-is and keep only the numeric
# condition columns (temperature, humidity).
unused_cols = ['away','gameday','home','stadium','tv','year']
df = df.drop(columns=unused_cols)
df_cond = df_cond[['temperature','humidity']]
df_stad = df_stad.drop(columns='address')

In [27]:
# Build the target vector y (train rows only) and the feature matrices.
# df holds the train rows first, followed by the test rows.
num_train = len(df_train)
y = df['y'].values[:num_train]

# Numeric features: match / stage / time from df plus the df_cond columns,
# standardized together with the scaler `ss` (re-fitted here on all rows).
numeric_cols = ['match','stage','time']
X = np.concatenate((df[numeric_cols].values, df_cond.values), axis=1)
X = ss.fit_transform(X)

# Append the PCA-reduced feature groups, left unscaled, to the right.
X = np.hstack([X, *encoded_pcas])

X_train = X[:num_train]
X_test = X[num_train:]

In [28]:
# Everything above is data preprocessing.
# Now move on to the data analysis.

In [None]:
# Tune an RBF-kernel SVR with a grid search over C and gamma, scored by negative
# mean squared error on 4 shuffled folds (fixed seed), then predict the test rows
# with the fitted search object.
kf = KFold(n_splits=4, shuffle=True, random_state=0)
svr = SVR()
param_grid = dict(
  kernel=['rbf'],
  C=[1e6, 1e7],
  gamma=[1e-2, 1e-3],
)
gs_svr = GridSearchCV(
  estimator=svr,
  param_grid=param_grid,
  cv=kf,
  scoring='neg_mean_squared_error',
)
gs_svr.fit(X_train, y)
print(gs_svr.best_params_)
y_pred = gs_svr.predict(X_test)

In [26]:
# Build the submission table: the id of every test row next to its prediction.
# The frame is assembled column by column so that `id` keeps the integer dtype
# it has in df_test. Stacking ids and predictions into one float array and then
# casting the ids back with astype('int16') would silently wrap any id above
# 32767.
df_submit = pd.DataFrame({
  'id': df_test['id'].values,
  'y_pred': np.ravel(y_pred),
})
df_submit

Unnamed: 0,id,y_pred
0,15822,9639.444316
1,15823,14793.050570
2,15824,34698.178215
3,15825,10713.625445
4,15827,25657.588943
5,15828,16914.140553
6,15829,5077.137626
7,15830,15136.998304
8,15831,9808.522106
9,15832,25228.803269


In [64]:
# Cap each prediction at the capacity (`capa`) of the stadium of that match.
# The stadium of every test row is looked up by id, then matched to df_stad by name.
stadium_of_match = df_test[['id','stadium']].rename(columns={'stadium':'name'})
df_submit = (
  df_submit
  .merge(stadium_of_match, on='id', how='left')
  .merge(df_stad, on='name', how='left')
)
# Row-wise minimum of prediction and capacity.
df_submit['y_pred'] = df_submit[['y_pred','capa']].min(axis=1)
# Remove the helper columns again.
df_submit = df_submit.drop(columns=['name','capa'])

In [65]:
# Write df_submit to disk as a UTF-8 CSV without a header row and without the index.
df_submit.to_csv(
  '../results/y_pred.csv',
  index=False,
  header=False,
  encoding='utf-8',
)