In [29]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore')

from sklearn.ensemble import RandomForestRegressor

import os
print(os.listdir("../Final/dataset"))

['data_clean(2).csv', 'data_clean(re).csv', 'data_clean.csv', 'subjects_data.csv', 'dbworld_bodies.mat', '.ipynb_checkpoints', 'r2_Finetune_train.csv', 'Finetune_train.csv', 'Finetune_test.csv', 'train_V2.csv', 'n_estimators5.csv', 'r2_Finetune_test.csv', 'dbworld_subjects.mat']


### Memorry Saving

In [2]:
#reference:https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """

    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    return df

### Load Data

In [3]:
df_multi = pd.read_csv('../Final/dataset/data_clean(2).csv', index_col='Id')
df_multi.shape

(4403464, 34)

In [4]:
df_multi = df_multi.loc[df_multi['matchType'].isin(['solo','solo-fpp','duo','duo-fpp','squad','squad-fpp'])]

### Drop features with low correlation

In [5]:
df_multi.drop(['kills','damageDealt','killPoints','matchDuration','maxPlace','numGroups','rankPoints','roadKills',\
              'teamKills','winPoints'], axis=1, inplace=True)

In [10]:
df_multi.drop('abnormal_numGroups',axis=1, inplace=True)

In [11]:
df_multi.head().T

Id,7f96b2f878858a,eef90569b9d03c,1eaf90ac73de72,4616d365dd2853,315c96c26c9aac
Unnamed: 0,0,1,2,3,4
groupId,4d4b580de459be,684d5656442f9e,6a4a42c3245a74,a930a9c79cd721,de04010b3458dd
matchId,a10357fd1a4a91,aeb375fc57110c,110163d8bb94ae,f1f1f4ef412d7e,6dc8ff871e21e6
assists,0,0,1,0,0
boosts,0,0,0,0,0
DBNOs,0,0,0,0,0
headshotKills,0,0,0,0,0
heals,0,0,0,0,0
killPlace,60,57,47,75,45
killStreaks,0,0,0,0,1


In [12]:
df_multi = reduce_mem_usage(df_multi)

In [8]:
df_multi = df_multi.dropna()

In [13]:
df_multi.shape

(4403464, 23)

### Split data into train and test

In [14]:
train_multi = df_multi.sample(frac=0.8,random_state=200,axis = 0)
test_multi = df_multi.drop(train_multi.index)

print ("train_multi: " + format(train_multi.shape))
print ("test_multi: " + format(test_multi.shape))

train_multi: (3522771, 23)
test_multi: (880693, 23)


### Feature Engineering

In [19]:
def feature_engineering(df, is_train=True):
    features = list(df.columns)
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchType")
    features.remove('winPlacePerc')
    
    y = None
    if is_train:
        y = df.groupby(['matchId','groupId'])['winPlacePerc'].agg('mean')
    else:
        y = df['winPlacePerc']
    
    # max by match and group
    agg = df.groupby(['matchId','groupId'])[features].agg('max')
    if is_train:
        df_out = agg.reset_index()[['matchId','groupId']]
    else:
        df_out = df[['matchId','groupId']]
    
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    
    # drop match id and group id
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)
    
    del agg
    
    return df_out, y
    
    

In [20]:
train_x, train_y = feature_engineering(train_multi, is_train=True)

In [21]:
train_x.shape

(1826370, 19)

In [None]:
del train_multi

In [22]:
test_x, test_y = feature_engineering(test_multi, is_train = False)

In [23]:
test_x.shape

(880693, 19)

In [None]:
test_y.shape

In [None]:
del test_multi

### RandomForest Baseline

In [27]:
rf = RandomForestRegressor(n_estimators=100, max_depth=20, criterion='mse', n_jobs=6)
rf.fit(train_x, train_y)
rf_y_pred = rf.predict(test_x)

In [28]:
mse = mean_squared_error(test_y, rf_y_pred)
mae = mean_absolute_error(test_y, rf_y_pred)
median_abosolute_error = median_absolute_error(test_y, rf_y_pred)
r2 = r2_score(test_y, rf_y_pred)
print("mse: " + format(mse))
print("rmse: " + format(mse**0.5))
print("mae: " + format(mae))
print("median abosolute error: " + format(median_abosolute_error))
print("r2: " + format(r2))

mse: 0.006932162039195814
rmse: 0.08325960628777807
mae: 0.059533268830981485
median abosolute error: 0.042951599654951766
r2: 0.9262997262756647


### Model Tuning

In [39]:
prediction = {}
prediction['mse'] = {}
prediction['rmse'] = {}
prediction['mae'] = {}
prediction['median_ae'] = {}
prediction['r2'] = {}

ground= {}
ground['mse'] = {}
ground['rmse'] = {}
ground['mae'] = {}
ground['median_ae'] = {}
ground['r2'] = {}

#### Finding a reasonable n_estimators value

In [None]:
for i in range(30,100,10):
    rf = RandomForestRegressor(n_estimators=i, max_depth=50, criterion='mse', n_jobs=6)
    rf.fit(train_x, train_y)
    rf_y_pred = rf.predict(train_x)
    rf_y_pred_test = rf.predict(test_x)
    mse = mean_squared_error(train_y, rf_y_pred)
    rmse = mse**0.5
    mae = mean_absolute_error(train_y, rf_y_pred)
    median_abosolute_error = median_absolute_error(train_y, rf_y_pred)
    r2 = r2_score(train_y, rf_y_pred)
    ground['mse'][i] = mse
    ground['rmse'][i] = rmse
    ground['mae'][i] = mae
    ground['median_ae'][i] = median_abosolute_error
    ground['r2'][i] = r2
    
    mse = mean_squared_error(test_y, rf_y_pred_test)
    rmse = mse**0.5
    mae = mean_absolute_error(test_y, rf_y_pred_test)
    median_abosolute_error = median_absolute_error(test_y, rf_y_pred_test)
    r2 = r2_score(test_y, rf_y_pred_test)
    prediction['mse'][i] = mse
    prediction['rmse'][i] = rmse
    prediction['mae'][i] = mae
    prediction['median_ae'][i] = median_abosolute_error
    prediction['r2'][i] = r2
    
    

#### Finding the coarse range of optimal "max_depth"

In [None]:
for i in range(10,100,20):
    rf = RandomForestRegressor(n_estimators=30, max_depth=i, criterion='mse', n_jobs=6)
    rf.fit(train_x, train_y)
    rf_y_pred = rf.predict(train_x)
    rf_y_pred_test = rf.predict(test_x)
    mse = mean_squared_error(train_y, rf_y_pred)
    rmse = mse**0.5
    mae = mean_absolute_error(train_y, rf_y_pred)
    median_abosolute_error = median_absolute_error(train_y, rf_y_pred)
    r2 = r2_score(train_y, rf_y_pred)
    ground['mse'][i] = mse
    ground['rmse'][i] = rmse
    ground['mae'][i] = mae
    ground['median_ae'][i] = median_abosolute_error
    ground['r2'][i] = r2
    
    mse = mean_squared_error(test_y, rf_y_pred_test)
    rmse = mse**0.5
    mae = mean_absolute_error(test_y, rf_y_pred_test)
    median_abosolute_error = median_absolute_error(test_y, rf_y_pred_test)
    r2 = r2_score(test_y, rf_y_pred_test)
    prediction['mse'][i] = mse
    prediction['rmse'][i] = rmse
    prediction['mae'][i] = mae
    prediction['median_ae'][i] = median_abosolute_error
    prediction['r2'][i] = r2

#### Finding the relatively precise optimal "max_depth"

In [None]:
for i in range(15,65,5):
    rf = RandomForestRegressor(n_estimators=90, max_depth=i, criterion='mse', n_jobs=6)
    rf.fit(train_x, train_y)
    rf_y_pred = rf.predict(train_x)
    rf_y_pred_test = rf.predict(test_x)
    mse = mean_squared_error(train_y, rf_y_pred)
    rmse = mse**0.5
    mae = mean_absolute_error(train_y, rf_y_pred)
    median_abosolute_error = median_absolute_error(train_y, rf_y_pred)
    r2 = r2_score(train_y, rf_y_pred)
    ground['mse'][i] = mse
    ground['rmse'][i] = rmse
    ground['mae'][i] = mae
    ground['median_ae'][i] = median_abosolute_error
    ground['r2'][i] = r2
    
    mse = mean_squared_error(test_y, rf_y_pred_test)
    rmse = mse**0.5
    mae = mean_absolute_error(test_y, rf_y_pred_test)
    median_abosolute_error = median_absolute_error(test_y, rf_y_pred_test)
    r2 = r2_score(test_y, rf_y_pred_test)
    prediction['mse'][i] = mse
    prediction['rmse'][i] = rmse
    prediction['mae'][i] = mae
    prediction['median_ae'][i] = median_abosolute_error
    prediction['r2'][i] = r2

#### Finally, using the processed data and determined model parameters for final model traning (run the code in RandomForest Baseline)