Automated Feature Engineering 

In [18]:
import pandas as pd
import numpy as np
import framequery as fq
import hashlib
import featuretools as ft
from featuretools import Feature 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [19]:
# Load only the first 250,000 rows of the training data. Passing `nrows`
# stops parsing at that point instead of reading the whole CSV and then
# discarding everything past the head.
df = pd.read_csv("train1.csv", nrows=250000)

In [20]:
# Count the missing values in every column of the raw frame.
df.isnull().sum()
# No missing values

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64

In [21]:
# Preview the leading rows of each matchType group
# (GroupBy.head() keeps up to 5 rows per group by default).
df.groupby('matchType').head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,75c10f333dfdb2,312d33be84f802,f98cefba51a79c,0,0,126.00,2,0,0,56,...,1,0.0,0,0.00,0,0,1463.00,5,1490,0.5000
1,3c9cd47c845b8f,4cfa90a5da9216,cf68f99a6d40fe,0,0,100.00,0,1,2,33,...,1,0.0,0,0.00,0,0,218.30,2,0,0.3333
2,537fae74d33487,a33d482531298a,faed6a22ecb949,0,0,145.00,2,0,0,75,...,0,0.0,0,0.00,1,0,642.90,1,1511,0.2128
3,8fbc99f2b72da6,157ec1680cae7c,49318240e11fe2,0,0,0.00,0,0,2,55,...,0,0.0,0,0.00,0,0,2951.00,5,1500,0.6383
4,15941736eba92d,6bf0b088393514,9fc250a4f0d6d7,0,3,111.30,2,1,14,31,...,0,637.6,0,0.00,0,0,1917.00,3,1491,0.6296
5,90f838f5838e43,0310194f0beb21,63cb73d87789bc,0,7,685.10,0,2,3,2,...,0,0.0,0,0.00,0,0,2641.00,6,1453,0.9694
6,cb57e6e5560d39,ae3394dc211a18,f9f5a94af344ec,0,0,0.00,0,0,1,53,...,0,0.0,0,0.00,0,0,2085.00,10,0,0.7551
7,e6b8beaf8b4d66,cf567a60174e1e,4c87489c3fb46a,0,1,139.70,0,0,0,56,...,0,3289.0,0,0.00,0,0,1773.00,6,0,0.6207
8,e7b238e276b50d,307232d29befa6,6d9eca79d9e756,2,2,103.60,0,0,0,43,...,2,0.0,0,0.00,0,0,2370.00,6,1530,0.9574
9,67c0dad5ab4267,2e899451072441,3594728c5ff83b,0,0,65.79,1,0,0,66,...,0,0.0,0,0.00,1,0,1446.00,6,0,0.5517


In [22]:
# Check whether 'Id' holds a distinct value on every row.
df['Id'].is_unique
# Each Id value is unique

True

In [23]:
# Name of the label column that is split off from the features below.
target = 'winPlacePerc'

In [24]:
# Hold out 20% of the rows. The split is seeded so that Restart & Run All
# reproduces the same partition; 11 matches the seed already used by the
# train/validation split further down in this notebook.
train, test = train_test_split(df, test_size=0.2, random_state=11)

In [25]:
# Separate the label column from the predictors of the training partition.
y_train = train[target]
x_train = train.drop(columns=[target])

# Not executed: the equivalent separation for the held-out partition.
# x_test = test.drop(target, axis = 1)
# y_test = test[target]

In [26]:
# Sanity check: True if any cell of the training features is missing.
x_train.isnull().values.any()

False

In [27]:
# Keep references to the group and match identifier columns.
# NOTE(review): neither name is referenced again in the visible part of this
# notebook — confirm whether they are still needed.
groupId = x_train['groupId']
matchId = x_train['matchId']

In [28]:
# Create an entity set 'es'
es = ft.EntitySet(id='Game_Id')

# Register the training frame as the base entity 'id'. x_train has no column
# named 'index', so make_index=True asks featuretools to create that integer
# key explicitly instead of relying on its implicit fallback.
es.entity_from_dataframe(entity_id='id', dataframe=x_train, index='index', make_index=True)



Entityset: Game_Id
  Entities:
    id [Rows: 200000, Columns: 29]
  Relationships:
    No relationships

In [29]:
# Combat statistics that are moved out of 'id' into the new 'player' entity.
player_columns = [
    'assists',
    'damageDealt',
    'headshotKills',
    'killPlace',
    'killPoints',
    'killStreaks',
    'kills',
    'longestKill',
    'roadKills',
    'vehicleDestroys',
    'DBNOs',
]

# Keyed on 'index', so 'player' ends up with one row per row of 'id'.
es.normalize_entity(
    base_entity_id='id',
    new_entity_id='player',
    index='index',
    additional_variables=player_columns,
)

Entityset: Game_Id
  Entities:
    id [Rows: 200000, Columns: 18]
    player [Rows: 200000, Columns: 12]
  Relationships:
    id.index -> player.index

In [30]:
# Team-play statistics that are moved out of 'id' into the new 'teamCoop' entity.
team_coop_columns = ['revives', 'teamKills']

# Keyed on 'groupId': rows of 'id' sharing a groupId point at the same
# 'teamCoop' row.
es.normalize_entity(
    base_entity_id='id',
    new_entity_id='teamCoop',
    index='groupId',
    additional_variables=team_coop_columns,
)

Entityset: Game_Id
  Entities:
    id [Rows: 200000, Columns: 16]
    player [Rows: 200000, Columns: 12]
    teamCoop [Rows: 190553, Columns: 3]
  Relationships:
    id.index -> player.index
    id.groupId -> teamCoop.groupId

In [31]:
# Travel and item-usage statistics that are moved out of 'id' into the new
# 'Movement' entity.
movement_columns = [
    'rideDistance',
    'swimDistance',
    'walkDistance',
    'weaponsAcquired',
    'boosts',
    'heals',
]

# Keyed on 'index', so 'Movement' ends up with one row per row of 'id'.
es.normalize_entity(
    base_entity_id='id',
    new_entity_id='Movement',
    index='index',
    additional_variables=movement_columns,
)

Entityset: Game_Id
  Entities:
    id [Rows: 200000, Columns: 10]
    player [Rows: 200000, Columns: 12]
    teamCoop [Rows: 190553, Columns: 3]
    Movement [Rows: 200000, Columns: 7]
  Relationships:
    id.index -> player.index
    id.groupId -> teamCoop.groupId
    id.index -> Movement.index

In [32]:
# Run Deep Feature Synthesis with the base entity 'id' as the target.
# Returns the computed feature matrix together with the feature definitions.
# n_jobs=-1 requests a parallel run; the recorded output shows the entity set
# being scattered to several workers.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='id',
    max_depth=2,
    verbose=1,
    n_jobs=-1,
)

Built 142 features


distributed.utils - ERROR - 
Traceback (most recent call last):
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\distributed\utils.py", line 713, in log_errors
    yield
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\distributed\client.py", line 1246, in _close
    quiet_exceptions=(CancelledError,),
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 965, in with_timeout
    chain_future(future, result)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 611, in chain_future
    future_add_done_callback(a, copy)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 658, in future_add_done_callback
    callback(future)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 606, in copy
 

EntitySet scattered to 10 workers in 43 seconds
Elapsed: 03:45 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 10/10 chunks


In [33]:
# (rows, generated features) of the DFS result.
feature_matrix.shape

(200000, 142)

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\distributed\utils.py", line 713, in log_errors
    yield
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\distributed\client.py", line 1246, in _close
    quiet_exceptions=(CancelledError,),
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 965, in with_timeout
    chain_future(future, result)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 611, in chain_future
    future_add_done_callback(a, copy)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 658, in future_add_done_callback
    callback(future)
  File "c:\users\sebastian\ap

In [34]:
# The label vector must have as many rows as the feature matrix.
y_train.shape

(200000,)

In [35]:
# From here on x_train refers to the generated feature matrix, not the raw columns.
# NOTE(review): the following split pairs these rows with y_train by position;
# this assumes feature_matrix kept the row order of the frame given to the
# entity set — TODO confirm.
x_train = feature_matrix

In [36]:
from sklearn.model_selection import train_test_split

# Split the training data into a training set (75%) and a validation set (25%);
# the fixed random_state makes the partition repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.25, random_state=11)



In [37]:
# Preview the first rows of the training features after the split.
x_train.head()

Unnamed: 0_level_0,Id,groupId,matchId,matchDuration,matchType,maxPlace,numGroups,rankPoints,winPoints,player.assists,...,Movement.MEAN(id.rankPoints),Movement.MEAN(id.winPoints),Movement.NUM_UNIQUE(id.Id),Movement.NUM_UNIQUE(id.groupId),Movement.NUM_UNIQUE(id.matchId),Movement.NUM_UNIQUE(id.matchType),Movement.MODE(id.Id),Movement.MODE(id.groupId),Movement.MODE(id.matchId),Movement.MODE(id.matchType)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
138412,f508a48c9e9d9f,9b99b301ee3527,661f2d1d5b5075,1916,duo-fpp,49,47,1535,0,1,...,1535,0,1,1,1,1,f508a48c9e9d9f,9b99b301ee3527,661f2d1d5b5075,duo-fpp
105651,6f685e818a797f,606a50ee1221b5,11d577d478f3d5,1343,squad-fpp,28,28,-1,1489,0,...,-1,1489,1,1,1,1,6f685e818a797f,606a50ee1221b5,11d577d478f3d5,squad-fpp
175345,d5bc93fadad3fa,c65b437f5283fe,b7f7692812bfc0,1874,duo-fpp,48,46,1517,0,0,...,1517,0,1,1,1,1,d5bc93fadad3fa,c65b437f5283fe,b7f7692812bfc0,duo-fpp
107238,5fbeaffa2d4c05,290b21c35f7e0f,a8488071e08d9d,1422,solo,98,96,-1,1571,0,...,-1,1571,1,1,1,1,5fbeaffa2d4c05,290b21c35f7e0f,a8488071e08d9d,solo
61210,7963e84df155cf,5b8b05e0d0fda7,4d126c113bfb79,1228,duo-fpp,46,45,1730,0,0,...,1730,0,1,1,1,1,7963e84df155cf,5b8b05e0d0fda7,4d126c113bfb79,duo-fpp


In [38]:
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,scale
from sklearn import neighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from math import sqrt
import matplotlib.pyplot as plt
import numpy as np

In [39]:
# Drop every object-dtype column (e.g. the Id / groupId / matchId / matchType
# strings seen in the preview), since the regressor needs numeric input.
x_train = x_train.select_dtypes(exclude=['object'])

In [40]:
# Confirm the dtypes of the columns that remain.
x_train.dtypes

matchDuration                          int64
maxPlace                               int64
numGroups                              int64
rankPoints                             int64
winPoints                              int64
player.assists                         int64
player.damageDealt                   float64
player.headshotKills                   int64
player.killPlace                       int64
player.killPoints                      int64
player.killStreaks                     int64
player.kills                           int64
player.longestKill                   float64
player.roadKills                       int64
player.vehicleDestroys                 int64
player.DBNOs                           int64
teamCoop.revives                       int64
teamCoop.teamKills                     int64
Movement.rideDistance                float64
Movement.swimDistance                float64
Movement.walkDistance                float64
Movement.weaponsAcquired               int64
Movement.b

In [41]:
# Shape after removing the object-dtype columns.
x_train.shape

(150000, 127)

In [42]:
# True if any generated feature contains a missing value.
x_train.isnull().values.any()

True

In [43]:
# Lift pandas' display limits so that wide or long objects are shown in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [44]:
# Missing-value count for every feature column.
x_train.isnull().sum()

matchDuration                             0
maxPlace                                  0
numGroups                                 0
rankPoints                                0
winPoints                                 0
player.assists                            0
player.damageDealt                        0
player.headshotKills                      0
player.killPlace                          0
player.killPoints                         0
player.killStreaks                        0
player.kills                              0
player.longestKill                        0
player.roadKills                          0
player.vehicleDestroys                    0
player.DBNOs                              0
teamCoop.revives                          0
teamCoop.teamKills                        0
Movement.rideDistance                     0
Movement.swimDistance                     0
Movement.walkDistance                     0
Movement.weaponsAcquired                  0
Movement.boosts                 

In [45]:
# Drop every column that has at least one missing value
# (127 -> 97 columns in the recorded run).
x_train = x_train.dropna(axis='columns')

In [46]:
# Re-check: every remaining column should now report zero missing values.
x_train.isnull().sum()

matchDuration                        0
maxPlace                             0
numGroups                            0
rankPoints                           0
winPoints                            0
player.assists                       0
player.damageDealt                   0
player.headshotKills                 0
player.killPlace                     0
player.killPoints                    0
player.killStreaks                   0
player.kills                         0
player.longestKill                   0
player.roadKills                     0
player.vehicleDestroys               0
player.DBNOs                         0
teamCoop.revives                     0
teamCoop.teamKills                   0
Movement.rideDistance                0
Movement.swimDistance                0
Movement.walkDistance                0
Movement.weaponsAcquired             0
Movement.boosts                      0
Movement.heals                       0
player.SUM(id.matchDuration)         0
player.SUM(id.maxPlace)  

In [47]:
# Shape of the cleaned training features passed to the model.
x_train.shape

(150000, 97)

In [48]:
# Fit a gradient-boosted regressor on the cleaned training features.
# fit() returns the estimator itself, so the fitted model is bound to `clf`
# and then shown as the cell's output.
clf = GradientBoostingRegressor(
    n_estimators=200,
    min_samples_leaf=2,
    max_depth=3,
).fit(x_train, y_train)
clf


distributed.utils - ERROR - 
Traceback (most recent call last):
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\distributed\utils.py", line 713, in log_errors
    yield
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\distributed\client.py", line 1246, in _close
    quiet_exceptions=(CancelledError,),
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 965, in with_timeout
    chain_future(future, result)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 611, in chain_future
    future_add_done_callback(a, copy)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 658, in future_add_done_callback
    callback(future)
  File "c:\users\sebastian\appdata\local\programs\python\python36\lib\site-packages\tornado\concurrent.py", line 606, in copy
 

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=2, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=200,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
# Give the validation frame exactly the columns (and column order) the model
# was fitted on. Filtering x_valid by its own dtypes and NaNs could keep a
# different set of columns than x_train and break or misalign predict().
x_valid = x_valid[x_train.columns]

In [50]:
# Predict the target for the validation rows.
y_pred = clf.predict(x_valid)


In [51]:
# Mean absolute error of the validation predictions.
error = mean_absolute_error(y_valid,y_pred) # compare predictions with the validation labels
print('MAE value  is:', error)

MAE value  is: 0.06557679805250811


In [None]:
# Root-mean-squared error and R^2 of the validation predictions.
# y_pred was produced from x_valid, so it is scored against y_valid (this
# notebook never defines y_test). `sqrt` is the name imported from math;
# the module `math` itself is not imported.
error_RMSE = sqrt(mean_squared_error(y_valid, y_pred))
print('RMSE value  is:', error_RMSE)
r2 = r2_score(y_valid, y_pred)
print('R2: ', r2)