In [16]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.simplefilter('ignore')

from copy import deepcopy
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error , r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from timeit import default_timer as timer

import gc, sys
gc.enable()

import os

In [17]:
def state(message,start = True, time = 0):
    if(start):
        print(f'Working on {message} ... ')
    else :
        print(f'Working on {message} took ({round(time , 3)}) Sec \n')

In [18]:
def feature_engineering(is_train=True):
    # When this function is used for the training data, load Train_pubg.csv :
    if is_train: 
        print("processing Train_pubg.csv")
        df = pd.read_csv('C:/Users/korn/Desktop/TNI/Paper/Code/train_pubg.csv')
        
    
    # When this function is used for the test data, load Test_pubg.csv :
    else:
        print("processing Test_pubg.csv")
        df = pd.read_csv('C:/Users/korn/Desktop/TNI/Paper/Code/test_pubg.csv')


    state('totalDistance')
    s = timer()
    # calculate total distance
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    e = timer()
    state('totalDistance', False, e - s)
          
    state('timeSurvived')
    s = timer()
    df['timeSurvived'] = df['timeSurvived'] 
    e = timer()
    state('timeSurvived', False, e - s)

    state('kills')
    s = timer()
    # calculate total kills and assists
    df['kills'] = df['kills'] + (df['assists']/2)
    e = timer()
    state('kills', False, e - s)

    target = 'winPlacePer'
    # Get a list of the features to be used
    features = list(df.columns)
    
    y = None
    
    # If we are processing the training data, process the target
    # (group the data by the match and the group then take the mean of the target) 
    if is_train: 
        y = np.array(df.groupby(['game','teamId'])[target].agg('mean'), dtype=np.float64)
        # Remove the target from the features list :
        features.remove(target)
    
    # Make new features indicating the mean of the features ( grouped by game and teamId ) :
    print("get group mean feature")
    agg = df.groupby(['game','teamId']).agg('mean')
    agg_rank = agg.groupby('game').rank(pct=True).reset_index()
    
    
    # If we are processing the training data let df_out = the grouped  'game' and 'teamId'
    if is_train: 
        df_out = agg.reset_index()[['game','teamId']]
    # If we are processing the test data let df_out = 'game' and 'teamId' without grouping 
    else: 
        df_out = df[['game','teamId']]
    
    # Merge agg and agg_rank (that we got before) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['game','teamId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['game','teamId'])
    
    # Make new features indicating the max value of the features for each group ( grouped by game and teamId )
    print("get group max feature")
    agg = df.groupby(['game','teamId']).agg('max')
    # Put the new features into a rank form ( max value will have the highest rank)
    agg_rank = agg.groupby('game').rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['game','teamId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['game','teamId'])
    
    # Make new features indicating the minimum value of the features for each group ( grouped by game and teamId )
    print("get group min feature")
    agg = df.groupby(['game','teamId']).agg('min')
    # Put the new features into a rank form ( max value will have the highest rank)
    agg_rank = agg.groupby('game').rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['game','teamId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['game','teamId'])
    
    # Make new features indicating the number of players in each group ( grouped by game and teamId )
    print("get group size feature")
    agg = df.groupby(['game','teamId']).size().reset_index(name='group_size')
     
    # Merge the group_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['game','teamId'])
    
    # Make new features indicating the mean value of each features for each match :
    print("get match mean feature")
    agg = df.groupby(['game']).agg('mean').reset_index()
    
    # Merge the new agg with df_out :
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['game'])
    
    # Make new features indicating the number of groups in each match :
    print("get match size feature")
    agg = df.groupby(['game']).size().reset_index(name='match_size')
    
    # Merge the match_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['game'])
    
    # Drop game and teamId
    df_out.drop(["game","teamId"], axis=1, inplace=True)
    
    # X is the output dataset (without the target) and y is the target :
    X = np.array(df_out, dtype=np.float64)
    
    
    del df, df_out, agg, agg_rank
    gc.collect()

    return X, y

In [19]:
X, y = feature_engineering(True)
# X_train, y_train = train_test_split(X,y, test_size = 0.3, random_state = 42)
gridParams ={             
             'learning_rate ': [0.1,0.01,0.05],
             'n_estimatos' : [1000,5000,10000],
             'num_leave' : [5,10,100,300],
             'boosting_type': ['gbdt','dart','goss','rf'],
             'objective': ['mae'],
            } 

mdl = LGBMRegressor(boosting_type= 'gbdt',
          objective= 'mae',          
          n_estimators= 10000,
          num_leaves= 300,
          max_depth= 14,
          learning2rate= 0.05,
          n_jobs= 2,
          colsample_bytree= 0.7,
          verbose= 2 )   

mdl.get_params().keys()

grid = GridSearchCV(mdl, gridParams, verbose=2, n_jobs= 2)

grid.fit(X,y)

print(grid.best_paramas_)
print(grid.best_score_)

processing Train_pubg.csv
Working on totalDistance ... 
Working on totalDistance took (0.013) Sec 

Working on timeSurvived ... 
Working on timeSurvived took (0.0) Sec 

Working on kills ... 
Working on kills took (0.008) Sec 

get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  8.5min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 36.2min


In [None]:
# model = lgb.LGBMClassifier( 
#     boosting_type="gbdt",
#     is_unbalance=True, 
#     random_state=10, 
#     n_estimators=50,
#     num_leaves=30, 
#     max_depth=8,
#     feature_fraction=0.5,  
#     bagging_fraction=0.8, 
#     bagging_freq=15, 
#     learning_rate=0.01,    
# )

In [None]:
# params_opt = {'n_estimators':range(200, 600, 80), 'num_leaves':range(20,60,10)}
# gridSearchCV = GridSearchCV(estimator = model, 
#     param_grid = params_opt, 
#     scoring='roc_auc',
#     n_jobs=4,
#     iid=False, 
#     verbose=1,
#     cv=3)
# gridSearchCV.fit(X,y)
# gridSearchCV.grid_scores_, gridSearchCV.best_params_, gridSearchCV.best_score_