# ライブラリの読み込み

In [74]:
import pandas as pd
import numpy as np
import json
import os
import random
import string
import re

from pathlib import Path
from tqdm import tqdm

import lightgbm as lgb
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from  sklearn.neural_network import MLPRegressor
from  sklearn.pipeline import make_pipeline
from tqdm import tqdm

# データのロード
まずはデータを読み込んで見ましょう。csvデータの読み込みは複数のやり方がありえますが、pandasのread_csv関数はその中でも機能が豊富で、扱いやすいためこれを使います。
これを使うと、csvデータを読み込み、pandas.DataFrameにして返してくれます。


```
 8   pitcherHand         20355 non-null  object 右投げか左投げか
 10  batterHand          20355 non-null  object 右打ちか左打ちか
 18  dir                 3642 non-null   object 打球方向(A-Z)
 19  dist                5166 non-null   float64 打球距離
 20  battingType         3642 non-null   object 打球種類(B:バント, G:ゴロ, P:ポップフライ, F:フライ, L:ライナー)
 21  isOuts              5166 non-null   object 投球結果がアウトか

```


In [None]:
#データの読み込みと前処理

In [130]:

import pandas as pd
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")
game = pd.read_csv("game_info.csv")
target = train['y']
print(train.info())

train = train.drop(['id','y'],axis=1)
game = game.drop('Unnamed: 0',axis=1)
test = test.drop('id',axis=1)
print(train.shape)
print(test.shape)
print(test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20400 entries, 0 to 20399
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  20400 non-null  int64  
 1   totalPitchingCount  20400 non-null  int64  
 2   B                   20400 non-null  int64  
 3   S                   20400 non-null  int64  
 4   O                   20400 non-null  int64  
 5   b1                  20400 non-null  bool   
 6   b2                  20400 non-null  bool   
 7   b3                  20400 non-null  bool   
 8   pitcher             20400 non-null  object 
 9   pitcherHand         20355 non-null  object 
 10  batter              20400 non-null  object 
 11  batterHand          20355 non-null  object 
 12  gameID              20400 non-null  int64  
 13  inning              20400 non-null  object 
 14  pitchType           20400 non-null  object 
 15  speed               20400 non-null  object 
 16  ball

In [131]:
train.shape,test.shape,target.shape,game.shape

((20400, 22), (33808, 13), (20400,), (726, 8))

In [132]:
TeamList = game['topTeam'].unique()
TeamDic ={}
for i in range(len(TeamList)):
    TeamDic[TeamList[i]] = i
print(TeamDic)
game['bottomTeam']=game['bottomTeam'].replace(TeamDic)
game['topTeam']=game['topTeam'].replace(TeamDic)

{'広島': 0, '中日': 1, '阪神': 2, 'ロッテ': 3, '楽天': 4, '日本ハム': 5, 'オリックス': 6, 'ソフトバンク': 7, '巨人': 8, 'DeNA': 9, 'ヤクルト': 10, '西武': 11}


In [84]:
game.head()

Unnamed: 0,startTime,bottomTeam,bgBottom,topTeam,place,startDayTime,bgTop,gameID
0,18:00,9,3,0,横浜,2020-06-19 18:00:00,6,20202173
1,18:00,10,2,1,神宮,2020-06-19 18:00:00,4,20202174
2,18:00,8,1,2,東京ドーム,2020-06-19 18:00:00,5,20202175
3,18:00,7,12,3,PayPayドーム,2020-06-19 18:00:00,9,20202170
4,18:00,6,11,4,京セラD大阪,2020-06-19 18:00:00,10,20202171


In [133]:
# 年月日、曜日、時分秒を追加
game['startDayTime'] = pd.to_datetime(game['startDayTime']) # 型を変換
game['year']=game["startDayTime"].dt.year
game['month']=game["startDayTime"].dt.month
game['day']=game["startDayTime"].dt.day
game['hour']=game["startDayTime"].dt.hour
game['dayofweek']=game["startDayTime"].dt.dayofweek
game['minute']=game["startDayTime"].dt.minute
game['second']=game["startDayTime"].dt.second
# 'startDayTime'を削除
game = game.drop(['startDayTime'],axis=1)

In [86]:
# データ保管

In [134]:
# 訓練データの球速を他の特徴量から予測できるように仮の目的変数とする。
# 欠損値は直前の値を入れて補完
target_speed = train['speed'].str.extract(r'(\d+)').fillna(method='ffill')

In [135]:
# 訓練データのみにある列名（テストデータにはない列名）のリストを作成
delcollist = []
for col in train.columns:
  if not col in test.columns:
    delcollist.append(col)
# 訓練データのみにある列名を削除
train = train.drop(delcollist,axis=1)

In [138]:
# inning を　数値に変換
import re
train['inning_num'] =  train['inning'].apply(lambda x: re.sub("\\D", "", x))
test['inning_num'] =  test['inning'].apply(lambda x: re.sub("\\D", "", x))

In [139]:

# 表裏を判定する関数
def omote_ura(x):
  if '表' in x:
    return 0
  else:
    return 1
# 表裏の列を追加
train['inning_ForB'] =  train['inning'].apply(lambda x: omote_ura(x))
test['inning_ForB'] =  test['inning'].apply(lambda x: omote_ura(x))

In [140]:
test.head()
#train.head()
#test.info()
#train.info()

Unnamed: 0,totalPitchingCount,B,S,O,b1,b2,b3,pitcher,pitcherHand,batter,batterHand,gameID,inning,inning_num,inning_ForB
0,2,1,0,0,False,False,False,遠藤 淳志,R,乙坂 智,L,20202564,2回表,2,0
1,1,0,0,0,False,False,False,バンデンハーク,R,西川 遥輝,L,20202106,3回裏,3,1
2,7,3,2,2,True,False,False,スアレス,R,堂林 翔太,R,20203305,9回裏,9,1
3,1,0,0,2,True,False,False,クック,R,井領 雅貴,L,20202650,3回裏,3,1
4,2,0,0,2,False,False,False,則本 昂大,R,安達 了一,R,20202339,2回表,2,0


In [141]:
# 表裏の列を追加
train['inning_ForB'] =  train['inning'].apply(lambda x: omote_ura(x))
test['inning_ForB'] =  test['inning'].apply(lambda x: omote_ura(x))

In [142]:
# game_infoの追加
train = pd.merge(train, game, how='left')
test = pd.merge(test, game, how='left')

In [143]:
# inningの削除
train = train.drop('inning',axis=1)
test = test.drop('inning',axis=1)

In [144]:
# ボール、ストライク、アウトの合計値を追加
train['total_stat'] = train['B']+train['S']+train['O']
test['total_stat'] = test['B']+test['S']+test['O']
train['B_S'] = train['B']+train['S']
test['B_S'] = test['B']+test['S']

In [145]:
# ベース上のランナーの数を追加
train['total_base'] = train['b1'].astype('int')+train['b2'].astype('int')+train['b3'].astype('int')
test['total_base'] = test['b1'].astype('int')+test['b2'].astype('int')+test['b3'].astype('int')

In [146]:
# バッターのチームを追加
train['batterTeam'] = train['topTeam']
train['batterTeam'] = train['batterTeam'].where(train['inning_ForB']==1, train['bottomTeam'])
test['batterTeam'] = test['topTeam']
test['batterTeam'] = test['batterTeam'].where(test['inning_ForB']==1, test['bottomTeam'])

In [147]:
# ピッチャーのチームを追加
train['pitcherTeam'] = train['topTeam']
train['pitcherTeam'] = train['pitcherTeam'].where(train['inning_ForB']==0, train['bottomTeam'])
test['pitcherTeam'] = test['topTeam']
test['pitcherTeam'] = test['pitcherTeam'].where(test['inning_ForB']==0, test['bottomTeam'])

In [148]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20400 entries, 0 to 20399
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   totalPitchingCount  20400 non-null  int64 
 1   B                   20400 non-null  int64 
 2   S                   20400 non-null  int64 
 3   O                   20400 non-null  int64 
 4   b1                  20400 non-null  bool  
 5   b2                  20400 non-null  bool  
 6   b3                  20400 non-null  bool  
 7   pitcher             20400 non-null  object
 8   pitcherHand         20355 non-null  object
 9   batter              20400 non-null  object
 10  batterHand          20355 non-null  object
 11  gameID              20400 non-null  int64 
 12  inning_num          20400 non-null  object
 13  inning_ForB         20400 non-null  int64 
 14  startTime           20400 non-null  object
 15  bottomTeam          20400 non-null  int64 
 16  bgBottom            20

In [149]:
# カテゴリカル変数のカラムを抽出
categorical_columns = [x for x in train.columns if train[x].dtypes == 'object']

In [150]:
# カテゴリカル変数をカウントエンコードする
#!pip install --user git+https://github.com/pfnet-research/xfeat.git
from xfeat import CountEncoder

encoder = CountEncoder(input_cols=categorical_columns)
train = encoder.fit_transform(train)
test = encoder.transform(test)

In [151]:
# 訓練データにターゲット列を追加する
train['target'] = target

In [152]:
#重複データの表示
#print(train[train_data.duplicated()])

#重複データの削除
#2040-3264=
train = train.drop_duplicates()
print(train)

       totalPitchingCount  B  S  O     b1     b2     b3 pitcher pitcherHand  \
0                       1  0  0  0  False  False  False   今永 昇太           L   
1                       2  1  0  0  False  False  False   今永 昇太           L   
2                       3  1  1  0  False  False  False   今永 昇太           L   
3                       4  2  1  0  False  False  False   今永 昇太           L   
4                       5  2  2  0  False  False  False   今永 昇太           L   
...                   ... .. .. ..    ...    ...    ...     ...         ...   
17131                   2  1  0  2  False  False  False    森 唯斗           R   
17132                   3  1  1  2  False  False  False    森 唯斗           R   
17133                   4  2  1  2  False  False  False    森 唯斗           R   
17134                   5  2  2  2  False  False  False    森 唯斗           R   
17135                   6  3  2  2  False  False  False    森 唯斗           R   

      batter  ... batterTeam  pitcherTeam pitcher_c

In [153]:
# カテゴリカル変数をターゲットエンコーディングする
from sklearn.model_selection import KFold
from xfeat import TargetEncoder

fold = KFold(n_splits=5, shuffle=True, random_state=42)
encoder = TargetEncoder(input_cols=categorical_columns,
                        target_col='target',
                        fold=fold)
train = encoder.fit_transform(train)
test = encoder.transform(test)

In [154]:
print(train.shape)

(17136, 47)


In [155]:
# エンコーディング前の列を削除する
train = train.drop(categorical_columns,axis=1)
test = test.drop(categorical_columns,axis=1)

In [156]:
# ターゲット列を削除
train = train.drop('target',axis=1)

In [157]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17136 entries, 0 to 17135
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   totalPitchingCount  17136 non-null  int64  
 1   B                   17136 non-null  int64  
 2   S                   17136 non-null  int64  
 3   O                   17136 non-null  int64  
 4   b1                  17136 non-null  bool   
 5   b2                  17136 non-null  bool   
 6   b3                  17136 non-null  bool   
 7   gameID              17136 non-null  int64  
 8   inning_ForB         17136 non-null  int64  
 9   bottomTeam          17136 non-null  int64  
 10  bgBottom            17136 non-null  int64  
 11  topTeam             17136 non-null  int64  
 12  bgTop               17136 non-null  int64  
 13  year                17136 non-null  int64  
 14  month               17136 non-null  int64  
 15  day                 17136 non-null  int64  
 16  hour

In [158]:
# pivot tabel を用いた特徴量を追加する関数
def get_game_id_vecs_features(input_df):
    _input_df = input_df
    # pivot table
    stat_df = pd.pivot_table(_input_df, index="gameID", columns="batter_te", values="total_stat").add_prefix("total_stat=")
    base_df = pd.pivot_table(_input_df, index="gameID", columns="batter_te", values="total_base").add_prefix("total_base=")
    inning_df = pd.pivot_table(_input_df, index="gameID", columns="batter_te", values="inning_num_ce").add_prefix("inning=")
    all_df = pd.concat([stat_df, base_df, inning_df], axis=1)
    
    # PCA all 
    sc_all_df = StandardScaler().fit_transform(all_df.fillna(0))
    pca = PCA(n_components=59, random_state=2021)
    pca_all_df = pd.DataFrame(pca.fit_transform(sc_all_df), index=all_df.index).rename(columns=lambda x: f"gameID_all_PCA={x:03}")
    # PCA Stat
    sc_stat_df = StandardScaler().fit_transform(stat_df.fillna(0))
    pca = PCA(n_components=16, random_state=2021)
    pca_stat_df = pd.DataFrame(pca.fit_transform(sc_stat_df), index=all_df.index).rename(columns=lambda x: f"gameID_stat_PCA={x:03}")
    # PCA bace
    sc_base_df = StandardScaler().fit_transform(base_df.fillna(0))
    pca = PCA(n_components=16, random_state=2021)
    pca_base_df = pd.DataFrame(pca.fit_transform(sc_base_df), index=all_df.index).rename(columns=lambda x: f"gameID_base_PCA={x:03}")
    # PCA inning
    sc_inning_df = StandardScaler().fit_transform(inning_df.fillna(0))
    pca = PCA(n_components=16, random_state=2021)
    pca_inning_df = pd.DataFrame(pca.fit_transform(sc_inning_df), index=all_df.index).rename(columns=lambda x: f"gameID_inning_PCA={x:03}")
    
    df = pd.concat([all_df, pca_all_df, pca_stat_df, pca_base_df, pca_inning_df], axis=1)
    output_df = pd.merge(_input_df[["gameID"]], df, left_on="gameID", right_index=True, how="left")
    return output_df

In [159]:
# 訓練データとテストデータを結合する
input_df = pd.concat([train, test]).reset_index(drop=True)  # use concat data

In [160]:
#　ピボットデータを作成する
output_df = get_game_id_vecs_features(input_df)

In [161]:
# ピボットデータを訓練データとテストデータに分割する
train_x = output_df.iloc[:len(train)]
test_x = output_df.iloc[len(train):].reset_index(drop=True)

In [162]:
train_x.shape,test_x.shape,train.shape,test.shape,target.shape,target_speed.shape

((17136, 2685), (33808, 2685), (17136, 39), (33808, 39), (20400,), (20400, 1))

In [163]:
# 元データとピボットデータを結合する
input_all_df = pd.concat([input_df,output_df],axis=1)
input_all_df.shape

(50944, 2724)

In [164]:
# null のカラムの確認
nul_sum = input_all_df.isnull().sum()
null_cols = list(nul_sum[nul_sum > 0].index)

# null があるカラムの削除
input_all_df = input_all_df.drop(null_cols,axis=1)

In [165]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# 分散が0（すべて同じ値）のカラムの探索
sel = VarianceThreshold(threshold=0)
sel.fit(input_all_df)

# get_supportで分散が0でないカラムのみをTrue値、分散が0のカラムはFalse値を返します
print(sum(sel.get_support()))

# 分散が0のカラムを削除
input_all_df =input_all_df.loc[:, sel.get_support()]
print(input_all_df.shape)

145
(50944, 145)


In [166]:
# indexとcolumnsを入れ替える
input_all_df_T = input_all_df.T

print(input_all_df_T.duplicated().sum())

# 同じ特徴量の名前を取得
duplicated_features = input_all_df_T[input_all_df_T.duplicated()].index.values

# 値が同じ特徴量の片方を削除
input_all_df = input_all_df.drop(duplicated_features,axis=1)

print(input_all_df.shape)

1
(50944, 143)


In [167]:
# テストデータと訓練データに分ける
X_train = input_all_df.iloc[:len(train)]
X_test = input_all_df.iloc[len(train):].reset_index(drop=True)

In [168]:
X_train.shape,X_test.shape

((17136, 143), (33808, 143))

In [169]:
# 作成した特徴量のデータを保存しておく
X_train.to_csv('features/preprocessed_train.csv',index=False)
X_test.to_csv('features/preprocessed_test.csv',index=False)
target.to_csv('features/preprocessed_target.csv',index=False)
target_speed.to_csv('features/preprocessed_speed.csv',index=False)

In [170]:
SEED = 42
NFOLDS = 5

In [171]:
# speed のデータを１次元に変換
target_speed = target_speed.to_numpy().reshape(-1,)

In [172]:
#ニューラルネットを作成する関数定義
def create_model_NN(activation, n_layers, n_neurons, solver):
    hidden_layer_sizes=[]
    
    #与えられたパラメータのレイヤを作成
    for i in range(n_layers):
        hidden_layer_sizes.append(n_neurons[i])
    #print('hidden_layer_sizes -> ' + str(hidden_layer_sizes))
    
    #ニューラルネットのモデルを作成
    model = MLPRegressor(activation = activation,
                         hidden_layer_sizes=hidden_layer_sizes,
                         solver = solver,
                         random_state=42
                        )
    #標準化とニューラルネットのパイプラインを作成
    pipe = make_pipeline(StandardScaler(),model)
    return pipe

In [173]:
# テストデータの「Speed」を予測する関数
def pred_speed_of_test_data(train_x,test,target_speed,param):
    ###################################
    ### パラメータの設定
    ##################################
    activation = param['activation']
    n_layers = param['n_layers']
    n_neurons=[]
    for i in range(n_layers):
        n_neurons.append(param['neuron' + str(i).zfill(2)])
    solver = param['solver']
    
    ###################################
    ### CVの設定
    ##################################
    
    FOLD_NUM = 5
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    scores = []
    mlp_pred = 0

    for i, (tdx, vdx) in enumerate(kf.split(X=train_x)):
        X_train, X_valid, y_train, y_valid = train_x.iloc[tdx], train_x.iloc[vdx], target_speed[tdx], target_speed[vdx]
        #モデルを作成
        mlp  = create_model_NN(activation, n_layers, n_neurons, solver)
        # 学習
        mlp.fit(X_train,y_train)
        # 予測
        mlp_pred += mlp.predict(test) / FOLD_NUM

    print('#######################################################')
    print('### Seed was predicted #######')
    print('#######################################################')
    return mlp_pred

In [174]:
# Speed予測用のハイパーパラメータ
param = {
"activation": 'tanh',
"n_layers": 9,
"neuron00": 45,
"neuron01": 52,
"neuron02": 57,
"neuron03": 79,
"neuron04": 21,
"neuron05": 102,
"neuron06": 118,
"neuron07": 31,
"neuron08": 66,
"solver": 'sgd',
}


In [175]:
# テストデータの「Speed」を予測する関数
def pred_speed_of_test_data(train_x,test,target_speed,param):
    ###################################
    ### パラメータの設定
    ##################################
    activation = param['activation']
    n_layers = param['n_layers']
    n_neurons=[]
    for i in range(n_layers):
        n_neurons.append(param['neuron' + str(i).zfill(2)])
    solver = param['solver']
    
    ###################################
    ### CVの設定
    ##################################
    
    FOLD_NUM = 5
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    scores = []
    mlp_pred = 0

    for i, (tdx, vdx) in enumerate(kf.split(X=train_x)):
        X_train, X_valid, y_train, y_valid = train_x.iloc[tdx], train_x.iloc[vdx], target_speed[tdx], target_speed[vdx]
        #モデルを作成
        mlp  = create_model_NN(activation, n_layers, n_neurons, solver)
        # 学習
        mlp.fit(X_train,y_train)
        # 予測
        mlp_pred += mlp.predict(test) / FOLD_NUM

    print('#######################################################')
    print('### Seed was predicted #######')
    print('#######################################################')
    return mlp_pred

# テストデータの『speed』を予測

In [176]:
# テストデータの「Speed」を予測する
speed_pred = pred_speed_of_test_data(X_train,X_test,target_speed,param)

#######################################################
### Seed was predicted #######
#######################################################


# y の学習と予測

In [177]:
# テストデータの「ｙ」を予測する関数
#####################################################3
### LGBで学習、予測する関数の定義
########################################################
def pred_y_of_test_data(train,test,target,lgb_param,mlp_pred,select_col_list):
    # --------------------------------------
    # パラメータ定義
    # --------------------------------------
    lgb_params = {
                    'objective': 'multiclass',
                    'boosting_type': 'gbdt',
                    'n_estimators': 50000,
                    'colsample_bytree': 0.5,
                    'subsample': 0.5,
                    'subsample_freq': 3,
                    'reg_alpha': 8,
                    'reg_lambda': 2,
                    'random_state': SEED,
        'bagging_fraction': lgb_param['bagging_fraction'],
        'bagging_freq': lgb_param['bagging_freq'],        
        'feature_fraction': lgb_param['feature_fraction'],
        "learning_rate":lgb_param['learning_rate'],
        'min_child_samples': lgb_param['min_child_samples'],
        'num_leaves': lgb_param['num_leaves'],
        
                  }

    # --------------------------------------
    # 学習と予測
    # --------------------------------------
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    lgb_oof = np.zeros(train.shape[0])
    lgb_pred = pd.DataFrame()

    train_x = train.loc[:][select_col_list]
    test_x = test.loc[:][select_col_list]

    train_x['speed'] = target_speed.astype('float')
    test_x['speed'] = mlp_pred
    
    target_y = target

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train_x)):
        X_train, y_train = train_x.iloc[trn_idx], target_y[trn_idx]
        X_valid, y_valid = train_x.iloc[val_idx], target_y[val_idx]
        X_test = test_x

        # LightGBM
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  eval_metric='logloss',
                  verbose=False,
                  early_stopping_rounds=500
                  )

        lgb_oof[val_idx] = model.predict(X_valid)
        lgb_pred[f'fold_{fold}'] = model.predict(X_test)
        f1_macro = f1_score(y_valid, lgb_oof[val_idx], average='macro')
        print(f"fold {fold} lgb score: {f1_macro}")

    # 予測値の最頻値を求める（ご指摘をいただき修正）
    sub_pred = lgb_pred.mode(axis=1)[0]
    print("+-" * 40)
    print(f"score: {f1_macro}")
    
    return sub_pred

In [178]:
# 「ｙ」を予測するモデルのハイパーパラメータを設定
lgb_param = {
"bagging_fraction": 0.7537281209924886,
"bagging_freq": 5,
"feature_fraction": 0.7548131884427044,
"learning_rate": 0.00854494687558397,
"min_child_samples": 78,
"num_leaves": 209,
}

In [179]:
# 予測に使う特徴量を選択
select_col_list =['B', 'O', 'b1', 'b3', 'bottomTeam', 'topTeam', 'bgTop',
                  'month', 'dayofweek', 'total_stat', 'pitcherTeam',
                  'pitcherHand_ce', 'batter_ce', 'inning_num_ce',
                  'startTime_ce', 'pitcherHand_te', 'batter_te',
                  'inning_num_te', 'startTime_te', 'place_te',
                  'gameID_all_PCA=000', 'gameID_all_PCA=002',
                  'gameID_all_PCA=004', 'gameID_all_PCA=005',
                  'gameID_all_PCA=009', 'gameID_all_PCA=012',
                  'gameID_all_PCA=015', 'gameID_all_PCA=016',
                  'gameID_all_PCA=017', 'gameID_all_PCA=019',
                  'gameID_all_PCA=023', 'gameID_all_PCA=024',
                  'gameID_all_PCA=029', 'gameID_all_PCA=031',
                  'gameID_all_PCA=035', 'gameID_all_PCA=039',
                  'gameID_all_PCA=040', 'gameID_all_PCA=042',
                  'gameID_all_PCA=045', 'gameID_all_PCA=046',
                  'gameID_all_PCA=047', 'gameID_all_PCA=048',
                  'gameID_all_PCA=049', 'gameID_all_PCA=051',
                  'gameID_all_PCA=053', 'gameID_all_PCA=054',
                  'gameID_all_PCA=057', 'gameID_stat_PCA=000',
                  'gameID_stat_PCA=001', 'gameID_stat_PCA=003',
                  'gameID_stat_PCA=004', 'gameID_stat_PCA=005',
                  'gameID_stat_PCA=006', 'gameID_stat_PCA=008',
                  'gameID_stat_PCA=010', 'gameID_stat_PCA=012',
                  'gameID_stat_PCA=014', 'gameID_stat_PCA=015',
                  'gameID_base_PCA=001', 'gameID_base_PCA=005',
                  'gameID_base_PCA=007', 'gameID_base_PCA=008',
                  'gameID_base_PCA=009', 'gameID_base_PCA=011',
                  'gameID_base_PCA=012', 'gameID_base_PCA=013',
                  'gameID_base_PCA=014', 'gameID_base_PCA=015',
                  'gameID_inning_PCA=001', 'gameID_inning_PCA=002',
                  'gameID_inning_PCA=003', 'gameID_inning_PCA=004',
                  'gameID_inning_PCA=006', 'gameID_inning_PCA=008',
                  'gameID_inning_PCA=009', 'gameID_inning_PCA=010',
                  'gameID_inning_PCA=012', 'gameID_inning_PCA=013',
                  'gameID_inning_PCA=014']

In [184]:
#学習と予測の実行
sub_pred = pred_y_of_test_data(X_train,X_test,target,lgb_param,speed_pred,select_col_list)

ValueError: Length of values (20400) does not match length of index (17136)

In [185]:
print(target)

0        0
1        1
2        0
3        2
4        4
        ..
20395    2
20396    0
20397    1
20398    0
20399    1
Name: y, Length: 20400, dtype: int64


In [186]:
# ------------------------------------------------------------------------------
# 提出ファイルの作成
# ------------------------------------------------------------------------------

#テスト結果の出力
submit_df = pd.DataFrame({'y': sub_pred.astype(int)})
submit_df.index.name = 'id'
submit_df.to_csv('submission/submission.csv')

NameError: name 'sub_pred' is not defined