In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore')

from copy import deepcopy

from sklearn.metrics import mean_absolute_error , r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

import gc, sys
gc.enable()

import os


In [2]:
# Thanks to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
#        else:
#            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
def part_of_data(df, part):
    
    game = df['game'].unique()
    game_part = np.random.choice(game, int(part * len(game)))
    
    df = df[df['game'].isin(game_part)]
    
    del game
    del game_part

In [4]:
#def add_new_features_1(df):
    
    # calculate total distance
    #df['totalDistance'] = df['rideDistance'] + df['walkDistance'] + df['swimDistance']


In [5]:
def add_new_features(df):

    # calculate total distance
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]

    df['timeSurvived'] = df['timeSurvived'] 
    
    # calculate total kills and assists
    df['kills'] = df['kills'] + (df['assists']/2)
    
    
    
    df[df == np.Inf] = np.NaN
    df[df == np.NINF] = np.NaN
    
    df.fillna(0, inplace=True)

In [6]:
def feature_engineering(df, is_train=True):
      
    features = list(df.columns)
    if 'winPlacePer' in features:
        features.remove('winPlacePer')
    
    y = None
    
    # average y for training dataset
    if is_train:
        y = df.groupby(['game','teamId'])['winPlacePer'].agg('mean')
    elif 'winPlacePerc' in df.columns:
        y = df['winPlacePerc']
    
    # Make new features indicating the mean of the features ( grouped by game and teamId ) :
    print("get group mean feature")
    agg = df.groupby(['game','teamId']).agg('mean')
    agg_rank = agg.groupby('game').rank(pct=True).reset_index()
    
    # If we are processing the training data let df_out = the grouped  'game' and 'teamId'
    if is_train:
        df_out = agg.reset_index()[['game','teamId']]
    # If we are processing the test data let df_out = 'game' and 'teamId' without grouping 
    else:
        df_out = df[['game','teamId']]

    # Merge agg and agg_rank (that we got before) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['game', 'teamId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['game', 'teamId'])
    
    # Make new features indicating the max value of the features for each group ( grouped by game and teamId )
    print("get group max feature")
    agg = df.groupby(['game','teamId']).agg('max')
    agg_rank = agg.groupby('game').rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['game','teamId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['game','teamId'])
    
    # Make new features indicating the minimum value of the features for each group ( grouped by game and teamId )
    print("get group min feature")
    agg = df.groupby(['game','teamId']).agg('min')
    # Put the new features into a rank form ( max value will have the highest rank)
    agg_rank = agg.groupby('game').rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['game','teamId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['game','teamId'])
    
    # Make new features indicating the number of players in each group ( grouped by game and teamId )
    print("get group size feature")
    agg = df.groupby(['game','teamId']).size().reset_index(name='group_size')
    
    # Merge the group_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['game','teamId'])
    
    # Make new features indicating the mean value of each features for each match :
    print("get match mean feature")
    agg = df.groupby(['game']).agg('mean').reset_index()
    
    # Merge the new agg with df_out :
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['game'])
    
    # Make new features indicating the number of groups in each match :
    print("get match size feature")
    agg = df.groupby(['game']).size().reset_index(name='match_size')

    # Merge the match_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['game'])
    
    # Drop game and teamId
    df_out.drop(["game","teamId"], axis=1, inplace=True)
    
    del agg, agg_rank
    
    return df_out, y

In [7]:
class Estimator(object):
    
    def fit(self, x_train, y_train, x_valid, y_valid):
        raise NotImplementedException
    
    def predict(self, x):
        raise NotImplementedException

In [8]:
class ScikitLearnEstimator(Estimator):
    
    def __init__(self, estimator):
        self.estimator = estimator
    
    def fit(self, x_train, y_train, x_valid, y_valid):
        self.estimator.fit(x_train, y_train)
    
    def predict(self, x):
        return self.estimator.predict(x)

In [9]:
def fit_predict_step(estimator, x_train, y_train, train_idx, valid_idx, x_test, oof):
    
    # prepare train and validation data
    x_train_train = x_train[train_idx]
    y_train_train = y_train[train_idx]
    x_train_valid = x_train[valid_idx]
    y_train_valid = y_train[valid_idx]
    
    # fit estimator
    estimator.fit(x_train_train, y_train_train, x_train_valid, y_train_valid)
    
    # collect OOF
    oof_part = estimator.predict(x_train_valid)
    
    print('MAE:', mean_absolute_error(y_train_valid, oof_part))
    oof[valid_idx] = oof_part
    
    # make predictions for test data
    y_part = estimator.predict(x_test)
    
    return y_part

In [10]:
def fit_predict(estimator, x_train, y_train, x_test):
    
    oof = np.zeros(x_train.shape[0])
    
    y = np.zeros(x_test.shape[0])
    
    kf = KFold(n_splits=5, random_state=42)
    
    for train_idx, valid_idx in kf.split(x_train):
        
        y_part = fit_predict_step(estimator, x_train, y_train, train_idx, valid_idx, x_test, oof)
        
        # average predictions for test data
        y += y_part / kf.n_splits
    
    print('Final MAE:', mean_absolute_error(y_train, oof))
    print('Final MSE:', mean_squared_error(y_train, oof))
    print('r2:', r2_score(y_train, oof))
    return oof, y

In [11]:
def fit_step(estimator, x_train, y_train, train_idx, valid_idx, oof):
    
    # prepare train and validation data
    x_train_train = x_train[train_idx]
    y_train_train = y_train[train_idx]
    x_train_valid = x_train[valid_idx]
    y_train_valid = y_train[valid_idx]
    
    # fit estimator
    estimator.fit(x_train_train, y_train_train, x_train_valid, y_train_valid)
    
    # collect OOF
    oof_part = estimator.predict(x_train_valid)
    
    mae = mean_absolute_error(y_train_valid, oof_part)
    print('MAE:', mae)
    
    oof[valid_idx] = oof_part
    
    return estimator, mae

In [12]:
def fit(estimator, x_train, y_train):
    
    oof = np.zeros(x_train.shape[0])
    
    kf = KFold(n_splits=5, random_state=42)
    
    trained_estimators = []
    
    for train_idx, valid_idx in kf.split(x_train):
        
        e, mae = fit_step(estimator, x_train, y_train, train_idx, valid_idx, oof)
        
        trained_estimators.append(deepcopy(e))
    
    print('Final MAE:', mean_absolute_error(y_train, oof))
    print('Final MSE:', mean_squared_error(y_train, oof))
    print('r2:', r2_score(y_train, oof))
    return oof, trained_estimators

In [13]:
def predict(trained_estimators, x_test):
    
    y = np.zeros(x_test.shape[0])
    
    for estimator in trained_estimators:
        
        y_part = estimator.predict(x_test)
        
        # average predictions for test data
        y += y_part / len(trained_estimators)
    
    return y

In [14]:
def pipeline_fit(estimator, df_train, scaler=None):
    
    # add new features
    add_new_features(df_train)
    
    # feature engineering
    x_train, y_train = feature_engineering(df_train, is_train=True)
    x_train = reduce_mem_usage(x_train)
    gc.collect()
    
    # scale
    if not (scaler is None):
        scaler.fit(x_train)
        scaled_x_train = scaler.transform(x_train)
    else:
        scaled_x_train = x_train.values
    
    del x_train
    gc.collect()
    
    # fit
    oof, trained_estimators = fit(estimator, scaled_x_train, y_train.values)
    
    del scaled_x_train
    del y_train
    gc.collect()
    
    return oof, trained_estimators

In [15]:
def pipeline_predict(trained_estimators, df_test, scaler=None):
    
    # add new features
    add_new_features(df_test)
    
    # feature engineering
    x_test, _ = feature_engineering(df_test, is_train=False)
    x_test = reduce_mem_usage(x_test)
    gc.collect()
    
    # scale
    if not (scaler is None):
        scaled_x_test = scaler.transform(x_test)
    else:
        scaled_x_test = x_test.values
    
    del x_test
    gc.collect()
    
    # predict
    y = predict(trained_estimators, scaled_x_test)
    
    del scaled_x_test
    gc.collect()
    
    return y

In [16]:
df_train = pd.read_csv('C:/Users/korn/Desktop/TNI/Paper/Code/train_pubg.csv')
df_train.shape

(13432, 24)

In [17]:
df_train = reduce_mem_usage(df_train)

Memory usage of dataframe is 2.46 MB
Memory usage after optimization is: 0.40 MB
Decreased by 83.9%


In [18]:
df_train.head().T

Unnamed: 0,0,1,2,3,4
Tournament,1.0,1.0,8.0,1.0,2.0
DBNOs,0.0,0.0,1.0,4.0,2.0
assists,0.0,0.0,1.0,5.0,0.0
boosts,2.0,0.0,11.0,7.0,2.0
damageDealt,18.453125,119.0,141.125,763.5,132.625
headshotKills,0.0,0.0,0.0,2.0,0.0
heals,0.0,0.0,29.0,3.0,1.0
killPlace,52.0,62.0,44.0,2.0,56.0
killStreaks,0.0,0.0,0.0,2.0,0.0
kills,0.0,0.0,0.0,4.0,0.0


In [19]:
gc.collect()

40

In [20]:
df_train.drop(df_train[df_train['winPlacePer'].isnull()].index, inplace=True)

In [21]:
import lightgbm as lgb

In [22]:
class LightGBM(Estimator):
    
    def __init__(self, params):
        self.params = params
    
    def fit(self, x_train, y_train, x_valid, y_valid):
        
        lgb_train = lgb.Dataset(data=x_train.astype('float64'), label=y_train.astype('float64'))
        lgb_valid = lgb.Dataset(data=x_valid.astype('float64'), label=y_valid.astype('float64'))
        
        self.lgb_model = lgb.train(self.params, lgb_train, valid_sets=lgb_valid, verbose_eval=1000)
    
    def predict(self, x):
        return self.lgb_model.predict(x.astype('float64'), num_iteration=self.lgb_model.best_iteration)

In [23]:
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': 'mae',
          'n_estimators': 10000,
          'early_stopping_rounds': 100,
          'num_leaves': 300,
          'max_depth': 14,
          'bagging_fraction': 0.8,
          'learning_rate': 0.05,
          'bagging_seed': 0,
          'num_threads': 4,
          'colsample_bytree': 0.7,
          'verbosity': -1 }

In [24]:
%%time

# scaler = StandardScaler()
oof, trained_estimators = pipeline_fit(LightGBM(params), df_train)

get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe is 0.85 MB
Memory usage after optimization is: 0.28 MB
Decreased by 67.1%
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[107]	valid_0's l1: 0.00236118
MAE: 0.0023611773408343906
Finished loading model, total used 107 iterations
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[112]	valid_0's l1: 0.00273723
MAE: 0.002737229370881817
Finished loading model, total used 112 iterations
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[121]	valid_0's l1: 0.00284993
MAE: 0.0028499285439259523
Finished loading model, total used 121 iterations
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[103]	valid_0's l1: 0.00500141
MAE: 0.005001414107609

In [25]:
del df_train

gc.collect()

20

In [26]:
df_test = pd.read_csv('C:/Users/korn/Desktop/TNI/Paper/Code/test_pubg.csv')
df_test.shape

(5757, 24)

In [27]:
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 1.05 MB
Memory usage after optimization is: 0.17 MB
Decreased by 83.8%


In [28]:
df_test_id = pd.DataFrame(index=df_test.index)

In [29]:
gc.collect()

80

In [30]:
y = pipeline_predict(trained_estimators, df_test)

get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe is 5.12 MB
Memory usage after optimization is: 1.69 MB
Decreased by 67.1%


In [31]:
del df_test

gc.collect()

20

In [32]:
df_oof = pd.DataFrame()
df_oof['lgb_oof'] = oof
df_oof.to_csv('light_gbm_oof.csv', index_label='id')

In [33]:
df_submission = pd.DataFrame(index=df_test_id.index)
df_submission['winPlacePer'] = y
df_submission.to_csv('light_gbm_raw.csv', index_label='id')