In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pubg-finish-placement-prediction/train_V2.csv
/kaggle/input/pubg-finish-placement-prediction/test_V2.csv
/kaggle/input/pubg-finish-placement-prediction/sample_submission_V2.csv


In [2]:
pd.set_option('max_columns',100)

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [4]:
train = import_data('../input/pubg-finish-placement-prediction/train_V2.csv')
test = import_data('../input/pubg-finish-placement-prediction/test_V2.csv')

Memory usage of dataframe is 983.90 MB
Memory usage after optimization is: 452.07 MB
Decreased by 54.1%
Memory usage of dataframe is 413.18 MB
Memory usage after optimization is: 201.94 MB
Decreased by 51.1%


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype   
---  ------           -----   
 0   Id               category
 1   groupId          category
 2   matchId          category
 3   assists          int8    
 4   boosts           int8    
 5   damageDealt      float16 
 6   DBNOs            int8    
 7   headshotKills    int8    
 8   heals            int8    
 9   killPlace        int8    
 10  killPoints       int16   
 11  kills            int8    
 12  killStreaks      int8    
 13  longestKill      float16 
 14  matchDuration    int16   
 15  matchType        category
 16  maxPlace         int8    
 17  numGroups        int8    
 18  rankPoints       int16   
 19  revives          int8    
 20  rideDistance     float16 
 21  roadKills        int8    
 22  swimDistance     float16 
 23  teamKills        int8    
 24  vehicleDestroys  int8    
 25  walkDistance     float16 
 26  weaponsAcquire

In [6]:
train.shape, test.shape

((4446966, 29), (1934174, 28))

In [7]:
y_train = train['winPlacePerc']
test_ids = test['Id']

train = train.drop(['Id', 'winPlacePerc'], axis=1)
test = test.drop('Id', axis=1)

In [8]:
#dropping missing value
y_train.drop(2744604,inplace=True)
train.drop(2744604,inplace=True)

In [9]:
data = pd.concat([train, test], axis=0).reset_index(drop=True)

In [10]:
data['matchplayers']=data.groupby('matchId')['matchId'].transform('count')
data['traveldistance']=data['walkDistance']+data['swimDistance']+data['rideDistance']
data['healsnboosts']=data['heals']+data['boosts']
data['assist']=data['assists']+data['revives']

data.drop(columns=['walkDistance','swimDistance','rideDistance'],axis=1,inplace=True)
data.drop(columns=['heals','boosts'],axis=1,inplace=True)
data.drop(columns=['assists','revives'],axis=1,inplace=True)
data.drop(columns=['groupId','matchId'],axis=1,inplace=True)

data.head(5)

Unnamed: 0,damageDealt,DBNOs,headshotKills,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,matchplayers,traveldistance,healsnboosts,assist
0,0.0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0,0,1,1466,96,244.75,0,0
1,91.5,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0,0,5,0,91,1445.0,0,0
2,68.0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0,0,2,0,98,161.75,0,1
3,32.90625,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0,0,3,0,91,202.75,0,0
4,100.0,0,0,45,0,1,1,58.53125,1424,solo-fpp,97,95,1560,0,0,0,2,0,97,49.75,0,0


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [9])], remainder='passthrough')
data = pd.DataFrame(ct.fit_transform(data))

In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(data)
data = pd.DataFrame(sc.transform(data))

In [13]:
train = data.iloc[:4446965].reset_index(drop='True')
test = data.iloc[4446965:].reset_index(drop='True')

In [14]:
from catboost import CatBoostRegressor
reg = CatBoostRegressor()
reg.fit(train, y_train)

Learning rate set to 0.154374
0:	learn: 0.2704503	total: 560ms	remaining: 9m 19s
1:	learn: 0.2399937	total: 1.02s	remaining: 8m 27s
2:	learn: 0.2149491	total: 1.46s	remaining: 8m 3s
3:	learn: 0.1938740	total: 1.9s	remaining: 7m 52s
4:	learn: 0.1774136	total: 2.33s	remaining: 7m 44s
5:	learn: 0.1637963	total: 2.78s	remaining: 7m 40s
6:	learn: 0.1527174	total: 3.21s	remaining: 7m 35s
7:	learn: 0.1438803	total: 3.64s	remaining: 7m 31s
8:	learn: 0.1366279	total: 4.08s	remaining: 7m 29s
9:	learn: 0.1309436	total: 4.5s	remaining: 7m 25s
10:	learn: 0.1256963	total: 4.94s	remaining: 7m 24s
11:	learn: 0.1218309	total: 5.32s	remaining: 7m 18s
12:	learn: 0.1187762	total: 5.74s	remaining: 7m 15s
13:	learn: 0.1162046	total: 6.16s	remaining: 7m 13s
14:	learn: 0.1133610	total: 6.61s	remaining: 7m 13s
15:	learn: 0.1113039	total: 7.01s	remaining: 7m 11s
16:	learn: 0.1098792	total: 7.43s	remaining: 7m 9s
17:	learn: 0.1081430	total: 7.83s	remaining: 7m 7s
18:	learn: 0.1070196	total: 8.19s	remaining: 7m 3

<catboost.core.CatBoostRegressor at 0x7f9973b15f10>

In [17]:
from sklearn.model_selection import cross_val_score

acc = -cross_val_score(reg,train,y_train,scoring='neg_mean_absolute_error', cv = 3)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.144794
0:	learn: 0.2726898	total: 306ms	remaining: 5m 5s
1:	learn: 0.2438274	total: 610ms	remaining: 5m 4s
2:	learn: 0.2189713	total: 919ms	remaining: 5m 5s
3:	learn: 0.1985053	total: 1.22s	remaining: 5m 3s
4:	learn: 0.1817422	total: 1.51s	remaining: 5m 1s
5:	learn: 0.1679867	total: 1.82s	remaining: 5m 1s
6:	learn: 0.1569142	total: 2.12s	remaining: 5m 1s
7:	learn: 0.1478290	total: 2.42s	remaining: 5m
8:	learn: 0.1403549	total: 2.72s	remaining: 4m 59s
9:	learn: 0.1338520	total: 3s	remaining: 4m 57s
10:	learn: 0.1289256	total: 3.27s	remaining: 4m 54s
11:	learn: 0.1246816	total: 3.56s	remaining: 4m 52s
12:	learn: 0.1207713	total: 3.84s	remaining: 4m 51s
13:	learn: 0.1178537	total: 4.12s	remaining: 4m 50s
14:	learn: 0.1151150	total: 4.42s	remaining: 4m 49s
15:	learn: 0.1132259	total: 4.71s	remaining: 4m 49s
16:	learn: 0.1112843	total: 4.98s	remaining: 4m 47s
17:	learn: 0.1098230	total: 5.25s	remaining: 4m 46s
18:	learn: 0.1081321	total: 5.51s	remaining: 4m 44s
19:	le

In [22]:
acc.mean()

0.056896648792747674

In [18]:
y_pred = reg.predict(test)

In [19]:
submission = pd.DataFrame({'Id':test_ids,'winPlacePerc':y_pred})
submission.to_csv('submission.csv',index=False)
print(submission)

                     Id  winPlacePerc
0        9329eb41e215eb      0.217473
1        639bd0dcd7bda8      0.971402
2        63d5c8ef8dfe91      0.674101
3        cf5b81422591d1      0.499084
4        ee6a295187ba21      0.915568
...                 ...           ...
1934169  a316c3a13887d5      0.758873
1934170  5312146b27d875      0.397364
1934171  fc8818b5b32ad3      0.868753
1934172  a0f91e35f8458f      0.873344
1934173  3696fc9f3a42b2      0.048611

[1934174 rows x 2 columns]
