# Classifiers with Feature Engineering

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV 
from sklearn.linear_model import LogisticRegression
from src.utils import preprocess, feature_engineering

## Loading data

In [2]:
# Open files
Data_X_train = pd.read_csv('data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv')
Data_Y_train = pd.read_csv('data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv', sep=';')

## Preprocessing

**Feature Engineering**

In [11]:
def extract_main_features(data_X_test):
    filter_col = [col for col in data_X_test if col.startswith('miss') or col.startswith('offensive rebound') or col.startswith('score') or col.startswith('assist')
                                            or col.startswith('two pts') or col.startswith('three pts') or col.startswith('fg') or col.startswith('ID')]
    data_X_test = data_X_test[filter_col]
    return data_X_test

In [5]:
X_train = extract_main_features(X_train)

In [3]:
Data_X_train = feature_engineering(Data_X_train)
Data_X_train.head()

100%|██████████████████████████████████████████████████████████████████████████████| 1439/1439 [00:59<00:00, 24.01it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1440/1440 [03:52<00:00,  6.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1440/1440 [01:26<00:00, 16.66it/s]


Unnamed: 0,ID,score_1,offensive rebound_1,defensive rebound_1,offensive foul_1,defensive foul_1,assist_1,lost ball_1,steals_1,bad pass_1,...,defensive foul_1440,assist_1440,lost ball_1440,steals_1440,bad pass_1440,block_1440,miss_1440,two pts_1440,three pts_1440,fg_1440
0,14186,-2,0,0,0,0,0,0,0,0,...,0,-3,3,3,-2,1,9,-6,-1,-7
1,13013,0,0,-1,0,0,0,0,0,0,...,0,1,0,0,1,-2,-1,-7,3,-4
2,7102,0,0,0,0,0,0,0,1,1,...,0,0,5,5,-2,3,-5,1,1,2
3,7637,-2,0,0,0,0,0,0,0,0,...,0,-1,-1,2,2,-1,-1,-2,0,-2
4,12350,0,0,0,0,0,0,0,0,0,...,0,4,3,2,1,1,3,-2,2,0


In [42]:
X_train, Y_train, X_val, Y_val = preprocess(Data_X_train, Data_Y_train, 0.8)

In [40]:
Data_X_train['score_1440']

0       -13
1        -5
2         5
3        -1
4         1
5        10
6        -3
7        -9
8        -8
9        -2
10       -1
11        1
12      -14
13      -16
14       -6
15        0
16       10
17        7
18        4
19        1
20       12
21        2
22       11
23       -7
24       11
25       -3
26       -6
27      -13
28       -7
29       13
         ..
12546     5
12547   -11
12548   -18
12549    -6
12550   -10
12551   -12
12552    -3
12553    18
12554    17
12555     6
12556    -2
12557   -10
12558     1
12559     0
12560     9
12561    20
12562     7
12563     4
12564   -19
12565     0
12566     7
12567    -5
12568    -8
12569    11
12570   -21
12571    -8
12572     7
12573    -9
12574     6
12575   -18
Name: score_1440, Length: 12576, dtype: int64

In [41]:
a = Data_X_train.as_matrix()
a[:,20147]

array([-13,  -5,   5, ...,  -9,   6, -18], dtype=int64)

## XGBoost classifier

In [17]:
xgb = GradientBoostingClassifier(max_depth=3, n_estimators = 1000)
xgb.fit(X_train, Y_train)
print('Training accurary : {0:.2f}'.format(xgb.score(X_train, Y_train)))
print('Validation accurary : {0:.2f}'.format(xgb.score(X_val, Y_val)))

Training accurary : 0.96
Validation accurary : 0.73


## Logistic Regression

In [16]:
LR = LogisticRegression(C=0.00001)
LR.fit(X_train, Y_train)
print('Training accurary : {0:.2f}'.format(LR.score(X_train, Y_train)))
print('Validation accurary : {0:.2f}'.format(LR.score(X_val, Y_val)))

Training accurary : 0.73
Validation accurary : 0.74


## Random Forest

In [45]:
# If the score difference is greater than 8 points (20147 = score_1140)
def postprocess(X, Y) :
    Y[X[:,20147] > 8] = 1
    Y[X[:,20147] <- 8] = 0
    return Y

**Grid Search**

In [9]:
# search_parameters = {'max_depth' : [None,2,3], 'n_estimators': [100,200,500], 'min_samples_split': [2,5,10],
#               'max_features' : ['auto',None], 'criterion' : ['gini', 'entropy']}
# RandomForest = RandomForestClassifier()
# clf = GridSearchCV(RandomForest, search_parameters)
# clf.fit(X_train, Y_train)
# model = clf.best_estimator_
# model

In [46]:
parameters = {'n_estimators': 150, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}

In [47]:
RandomForest = RandomForestClassifier(**parameters)
RandomForest.fit(X_train, Y_train)
Y_train = postprocess(X_train, Y_train)
Y_val = postprocess(X_val, Y_val)
print('Training accurary : {0:.2f}'.format(RandomForest.score(X_train, Y_train)*100))
print('Validation accurary : {0:.2f}'.format(RandomForest.score(X_val, Y_val)*100))

Training accurary : 95.24
Validation accurary : 73.25
