In [21]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [5]:
PATH_TO_DATA = './data'
train_df = pd.read_csv(os.path.join(PATH_TO_DATA,'train_features.csv'), index_col='match_id_hash')
target_df = pd.read_csv(os.path.join(PATH_TO_DATA,'train_targets.csv'), index_col='match_id_hash')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_features.csv'), index_col='match_id_hash')

In [6]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

In [15]:
X = train_df.reset_index(drop=True)
y = target_df['radiant_win']
X_test = test_df.copy().reset_index(drop=True)

# LightGBM

In [20]:
lgb_params = {'boost': 'gbdt',
          'feature_fraction': 0.05,
          'learning_rate': 0.01,
          'max_depth': -1,  
          'metric':'auc',
          'min_data_in_leaf': 50,
          'num_leaves': 32,
          'num_threads': -1,
          'verbosity': 1,
          'objective': 'binary'
         }

In [23]:
feature_importance = pd.DataFrame()
scores = []
prediction = np.zeros(len(X_test))

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    X_train, X_valid = X.loc[train_index], X.loc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    
    train_dataset = lgb.Dataset(X_train, label=y_train)
    valid_dataset = lgb.Dataset(X_valid, label=y_valid)
    
    model = lgb.train(lgb_params, 
                      train_dataset, 
                      num_boost_round=20000,
                      valid_sets = [train_data, valid_data],
                      verbose_eval=1000,
                      early_stopping_rounds=200)
    
    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # scores
    scores.append(roc_auc_score(y_valid, y_pred_valid))

    # Summing the predictions over 5 models to get average
    prediction += y_pred
    
    # feature importance
    fold_importance = pd.DataFrame()
    fold_importance["feature"] = X.columns
    fold_importance["importance"] = model.feature_importance()
    fold_importance["fold"] = fold_n + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    

    
prediction /= n_fold
print('CV mean score: {0:.4f}, std: {1:.4f}'.format(np.mean(scores), np.std(scores)))

NameError: name 'train_data' is not defined

In [None]:


train_data = lgb.Dataset(X_train, y_train)
valid_data = lgb.Dataset(X_valid, y_valid)

