In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore')

from sklearn.ensemble import RandomForestRegressor


In [2]:
#reference:https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns to the narrowest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected one by one.
    use_float16 : bool, default True
        When True, float columns whose min/max fit in float16 are cast to
        float16. float16 keeps only ~3 significant decimal digits, so pass
        False to stop at float32 when precision matters more than memory.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    * Only plain NumPy integer (signed or unsigned) and float columns are
      touched. Bool, object, datetime, categorical and other extension dtypes
      are left as they are instead of being pushed through the float branch.
    * Range checks are inclusive, so a column spanning exactly -128..127
      becomes int8.
    * A column is never widened: a candidate dtype is used only when it is
      strictly smaller than the current one.
    * Integer columns are downcast to signed types; an unsigned column whose
      values do not fit a smaller signed type is left unchanged.
    """
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; kind filters out
        # bool ('b'), object ('O'), datetime ('M'), etc.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates, limits = int_candidates, np.iinfo
        else:
            candidates, limits = float_candidates, np.finfo

        # Candidates are ordered narrowest first; stop at the first fit.
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # No remaining candidate would save memory.
                break
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # which leaves the column untouched.
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break
    return df

In [3]:
# Load the cleaned dataset from the working directory, using the 'Id' column
# as the row index, then display (rows, columns) as a quick sanity check.
df_multi = pd.read_csv('data_clean.csv', index_col='Id')
df_multi.shape

(4406356, 33)

In [4]:
# Downcast numeric columns to smaller dtypes to cut the frame's memory use.
# reduce_mem_usage modifies the frame it is given and returns that same object.
df_multi = reduce_mem_usage(df_multi)

In [5]:
# Remove the 'kills' and 'damageDealt' columns. Rebinding the name (instead of
# inplace=True) together with errors='ignore' makes this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
df_multi = df_multi.drop(columns=['kills', 'damageDealt'], errors='ignore')

In [6]:
# Shape check after the column drop (recorded run: 33 columns before, 31 after).
df_multi.shape

(4406356, 31)

In [7]:
# Row-level 80/20 split: draw 80% of the rows with a fixed seed, then use the
# remaining index labels as the test set.
# NOTE(review): the split is by row while features/targets are later aggregated
# per (matchId, groupId), so rows of one group can land on both sides of the
# split -- confirm this is intended.
train_multi = df_multi.sample(frac=0.8, random_state=200, axis=0)
test_multi = df_multi.drop(index=train_multi.index)

print(f"train_multi: {train_multi.shape}")
print(f"test_multi: {test_multi.shape}")

train_multi: (3525085, 31)
test_multi: (881271, 31)


In [8]:
def feature_engineering(df, is_train=True, agg_func='max'):
    """Build group-level features (and targets) from per-player rows.

    Every column except ``matchId``, ``groupId``, ``matchDuration``,
    ``matchType`` and ``winPlacePerc`` is treated as a feature and aggregated
    per ``(matchId, groupId)`` pair. Non-positive ``rankPoints`` are replaced
    by 0 before aggregating. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-player rows; must contain the five columns named above plus
        ``rankPoints``.
    is_train : bool, default True
        True  -> one output row per group (ordered by the sorted group keys);
                 the target is the group's mean ``winPlacePerc``.
        False -> one output row per input row, in input order, each carrying
                 its group's aggregated features; the target is the row's own
                 ``winPlacePerc``.
    agg_func : str or callable, default 'max'
        Aggregation applied to the feature columns; e.g. 'mean' or 'min'
        for the alternative models.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Feature frame without the key columns (fresh RangeIndex), and the
        matching target series.

    Raises
    ------
    ValueError
        If one of the five required non-feature columns is missing.
    """
    keys = ['matchId', 'groupId']
    non_features = keys + ['matchDuration', 'matchType', 'winPlacePerc']

    missing = [name for name in non_features if name not in df.columns]
    if missing:
        raise ValueError("missing required column(s): " + ", ".join(missing))

    features = [name for name in df.columns if name not in non_features]

    # assign() works on a copy, so the caller's frame keeps its original
    # rankPoints values.
    rank_points = df['rankPoints']
    df = df.assign(rankPoints=np.where(rank_points <= 0, 0, rank_points))

    grouped = df.groupby(keys)
    agg = grouped[features].agg(agg_func)

    if is_train:
        # Both results come from the same groupby, so rows of `agg` and `y`
        # share the same (sorted) group order.
        y = grouped['winPlacePerc'].agg('mean')
        df_out = agg.reset_index(drop=True)
    else:
        y = df['winPlacePerc']
        # Left merge keeps one row per input row, in input order.
        df_out = (
            df[keys]
            .merge(agg.reset_index(), how='left', on=keys)
            .drop(columns=keys)
        )

    return df_out, y

    

In [9]:
# Build the training set: one feature row and one target (mean winPlacePerc)
# per (matchId, groupId) pair.
train_x, train_y = feature_engineering(train_multi, is_train=True)

In [10]:
# (number of training groups, number of feature columns)
train_x.shape

(1825519, 27)

In [11]:
# Drop the reference to the raw training rows so their memory can be reclaimed.
del train_multi

In [12]:
# Build the test set: every row receives its group's aggregated features,
# and test_y keeps each row's own winPlacePerc.
test_x, test_y = feature_engineering(test_multi, is_train = False)

In [13]:
# (number of test rows, number of feature columns)
test_x.shape

(881271, 27)

In [14]:
# Should have the same number of rows as test_x (one target per test row).
test_y.shape

(881271,)

In [15]:
# Drop the reference to the raw test rows so their memory can be reclaimed.
del test_multi

In [16]:
# Fit a small random forest on the group-level training data and predict a
# winPlacePerc for every test row.
# * The split criterion is left at its default (squared error). The explicit
#   'mse' spelling was renamed in newer scikit-learn releases and is no longer
#   accepted there, while the default works across versions.
# * random_state pins the bootstrap / feature sampling so reruns reproduce the
#   same metrics; it reuses the seed of the train/test split.
rf = RandomForestRegressor(n_estimators=10, n_jobs=1, random_state=200)
rf.fit(train_x, train_y)
rf_y_pred = rf.predict(test_x)

In [17]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)

# Score the per-row predictions against the held-out targets.
mse = mean_squared_error(test_y, rf_y_pred)
mae = mean_absolute_error(test_y, rf_y_pred)
median_abosolute_error = median_absolute_error(test_y, rf_y_pred)
r2 = r2_score(test_y, rf_y_pred)

# Report each metric on its own line; rmse is derived from mse.
for metric_label, metric_value in (
    ("mse", mse),
    ("rmse", mse**0.5),
    ("mae", mae),
    ("median abosolute error", median_abosolute_error),
    ("r2", r2),
):
    print(f"{metric_label}: {metric_value}")

mse: 0.007779995546336329
rmse: 0.08820428303850289
mae: 0.06264257294348112
median abosolute error: 0.04477539062500002
r2: 0.9176451082452928
