# Discrete Wavelet Transform

In [3]:
import sys
sys.chdir('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import preprocess
import pywt

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, BatchNormalization, Dropout, Flatten, Dense, LSTM
from keras import losses

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

## Loading data

In [4]:
# Open files
Data_X_train = pd.read_csv('data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv')
Data_Y_train = pd.read_csv('data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv', sep=';')
X_test = pd.read_csv('data/challenge_fichiers_dentrees_de_test_challenge_nba/test.csv')

In [5]:
kept_features = ('ID','miss', 'score', 'assist', 'offensive rebound')
filter_col = [col for col in Data_X_train if col.startswith(kept_features)]
Data_X_train_reduced = Data_X_train[filter_col]
Data_X_train_reduced.head()

Unnamed: 0,ID,score_1,offensive rebound_1,assist_1,miss_1,score_2,offensive rebound_2,assist_2,miss_2,score_3,...,assist_1438,miss_1438,score_1439,offensive rebound_1439,assist_1439,miss_1439,score_1440,offensive rebound_1440,assist_1440,miss_1440
0,14186,-2,0,0,0,-2,0,0,0,-2,...,-4,9,-16,8,-4,9,-13,8,-3,9
1,13013,0,0,0,1,0,0,0,1,0,...,2,0,-5,1,1,0,-5,1,1,-1
2,7102,0,0,0,0,0,0,0,0,0,...,0,-5,5,-1,0,-5,5,-1,0,-5
3,7637,-2,0,0,0,-2,0,0,0,-2,...,-1,-1,-1,2,-1,-1,-1,2,-1,-1
4,12350,0,0,0,1,0,0,0,1,0,...,4,4,1,4,4,4,1,4,4,3


In [6]:
X, Y, _, _ = preprocess(Data_X_train_reduced, Data_Y_train, 1)
nb_games = len(X)

## K folds

In [7]:
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 7)

In [8]:
def evaluate_model(model, X, Y, skf) :
    accs_train = []
    accs_val = []
    for train, val in skf.split(X, Y) :
        model.fit(X[train], Y[train])
        acc_train = model.score(X[train], Y[train])
        acc_val = model.score(X[val], Y[val])
        accs_train.append(acc_train)
        accs_val.append(acc_val)
        print('(Training, Validation) accuracies: ({0:.2f},{1:.2f})'.format(100*acc_train, 100*acc_val))

    print('Mean Training Accuracy: {0:.2f} +/- {1:.2f}'.format(100*np.mean(accs_train), 100*np.std(accs_train)))
    print('Mean Validation Accuracy: {0:.2f} +/- {1:.2f}'.format(100*np.mean(accs_val), 100*np.std(accs_val)))   

## Create Wavelet

In [12]:
def create_wavelet_features(wav_type, data_X_train, features, nb_freq, tw) :
    X_DWT = []
    for feat in features :
        filter_col = [col for col in Data_X_train if col.startswith(feat)]
        for i in range(int(1440/tw)) :        
            X_feat = data_X_train[filter_col].as_matrix()[:,i*tw:(i+1)*tw]
            cA, cD = pywt.dwt(X_feat, wav_type)
            X_DWT.append(cA[:,:nb_freq])
            X_DWT.append(cD[:,:nb_freq])
    X_wt = np.hstack(X_DWT)   
    return X_wt

# Mix features in frequence and time

In [10]:
tw = 240 # 4min
nb_freq = 100 # 50 first features

### Haar Wavelet

In [13]:
main_features = ['miss', 'score', 'assist', 'offensive rebound'] # , 'defensive rebound', 'offensive foul', 'defensive foul', 'lost ball', 'steals', 'bad pass', 'block']

X_wt = create_wavelet_features('haar', Data_X_train, main_features, nb_freq, tw)

### Logistic Regression

In [35]:
c = 1e-5
LR = LogisticRegression(C=c)
evaluate_model(LR, X_wt, Y, skf)

(Training, Validation) accuracies: (55.40,52.23)
(Training, Validation) accuracies: (55.28,51.19)
(Training, Validation) accuracies: (55.09,51.49)
(Training, Validation) accuracies: (55.32,52.68)
(Training, Validation) accuracies: (55.03,52.27)
Mean Training Accuracy: 55.22 +/- 0.14
Mean Validation Accuracy: 51.97 +/- 0.55


## Mix time and frequency features

**Aggregate time features by subsequence of 10 seconds**

In [14]:
X_seq = X.reshape((nb_games, 4, 10, -1), order = 'F')
X_seq = X_seq.mean(axis = 2)
X_seq = X_seq.reshape(nb_games, 576)
X_tot = np.concatenate((X_seq, X_wt), axis = 1)

### Logistic Regression

In [15]:
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 7)

In [16]:
c = 1e-5
LR = LogisticRegression(C=c)
evaluate_model(LR, X_tot, Y, skf)

(Training, Validation) accuracies: (72.52,71.46)
(Training, Validation) accuracies: (72.88,70.67)
(Training, Validation) accuracies: (72.60,71.61)
(Training, Validation) accuracies: (71.91,73.76)
(Training, Validation) accuracies: (72.51,72.35)
Mean Training Accuracy: 72.49 +/- 0.32
Mean Validation Accuracy: 71.97 +/- 1.04


### Random Forest CLassifier

In [None]:
rdmforest = RandomForestClassifier(n_estimators = 1000, max_depth = 6)
evaluate_model(rdmforest, X_tot, Y, skf)

(Training, Validation) accuracies: (74.25,72.62)
(Training, Validation) accuracies: (74.26,71.58)
(Training, Validation) accuracies: (74.49,71.81)
(Training, Validation) accuracies: (74.35,71.57)
(Training, Validation) accuracies: (74.58,70.88)
Mean Training Accuracy: 74.39 +/- 0.13
Mean Validation Accuracy: 71.69 +/- 0.56


### MLP CLassifier

In [17]:
MLP = MLPClassifier(hidden_layer_sizes = (50,20), 
                     alpha = 10)
evaluate_model(MLP, X_tot, Y, skf)

(Training, Validation) accuracies: (71.99,70.63)
(Training, Validation) accuracies: (72.29,70.03)
(Training, Validation) accuracies: (71.63,71.09)
(Training, Validation) accuracies: (71.95,71.85)
(Training, Validation) accuracies: (71.93,71.64)
Mean Training Accuracy: 71.96 +/- 0.21
Mean Validation Accuracy: 71.05 +/- 0.66


## Daubechies Wavelet

In [18]:
main_features = ['miss', 'score', 'assist', 'offensive rebound'] # , 'defensive rebound', 'offensive foul', 'defensive foul', 'lost ball', 'steals', 'bad pass', 'block']

X_wt = create_wavelet_features('db1', Data_X_train, main_features, nb_freq, tw)

In [19]:
X_seq = X.reshape((nb_games, 4, 10, -1), order = 'F')
X_seq = X_seq.mean(axis = 2)
X_seq = X_seq.reshape(nb_games, 576)
X_tot = np.concatenate((X_seq, X_wt), axis = 1)

### Random Forest Classifier

In [20]:
rdmforest = RandomForestClassifier(n_estimators = 1000, max_depth = 6)
evaluate_model(rdmforest, X_tot, Y, skf)

(Training, Validation) accuracies: (74.62,71.46)
(Training, Validation) accuracies: (74.52,70.87)
(Training, Validation) accuracies: (74.56,71.93)
(Training, Validation) accuracies: (73.97,72.25)
(Training, Validation) accuracies: (74.31,72.12)
Mean Training Accuracy: 74.40 +/- 0.24
Mean Validation Accuracy: 71.72 +/- 0.50


### MLP Classifier

In [21]:
MLP = MLPClassifier(hidden_layer_sizes = (50,20), 
                     alpha = 10)
evaluate_model(MLP, X_tot, Y, skf)

(Training, Validation) accuracies: (72.20,71.94)
(Training, Validation) accuracies: (71.96,69.52)
(Training, Validation) accuracies: (71.66,70.58)
(Training, Validation) accuracies: (70.21,71.41)
(Training, Validation) accuracies: (71.45,71.24)
Mean Training Accuracy: 71.50 +/- 0.69
Mean Validation Accuracy: 70.94 +/- 0.83


### Logistic Regression

In [22]:
c = 1e-5
LR = LogisticRegression(C=c)
evaluate_model(LR, X_tot, Y, skf)

(Training, Validation) accuracies: (72.52,71.46)
(Training, Validation) accuracies: (72.88,70.67)
(Training, Validation) accuracies: (72.60,71.61)
(Training, Validation) accuracies: (71.91,73.76)
(Training, Validation) accuracies: (72.51,72.35)
Mean Training Accuracy: 72.49 +/- 0.32
Mean Validation Accuracy: 71.97 +/- 1.04


## Biorthogonal Wavelet

In [23]:
main_features = ['miss', 'score', 'assist', 'offensive rebound']
X_wt = create_wavelet_features('db1', Data_X_train, main_features, nb_freq, tw)

In [24]:
X_seq = X.reshape((nb_games, 4, 10, -1), order = 'F')
X_seq = X_seq.mean(axis = 2)
X_seq = X_seq.reshape(nb_games, 576)
X_tot = np.concatenate((X_seq, X_wt), axis = 1)

### Random Forest Classifier

In [25]:
rdmforest = RandomForestClassifier(n_estimators = 1000, max_depth = 6)
evaluate_model(rdmforest, X_tot, Y, skf)

(Training, Validation) accuracies: (74.64,71.50)
(Training, Validation) accuracies: (74.52,71.03)
(Training, Validation) accuracies: (74.54,71.89)
(Training, Validation) accuracies: (74.11,72.64)
(Training, Validation) accuracies: (74.29,72.16)
Mean Training Accuracy: 74.42 +/- 0.19
Mean Validation Accuracy: 71.84 +/- 0.55


### MLP CLassifier

In [26]:
MLP = MLPClassifier(hidden_layer_sizes = (50,20), 
                     alpha = 10)
evaluate_model(MLP, X_tot, Y, skf)

(Training, Validation) accuracies: (72.62,71.50)
(Training, Validation) accuracies: (71.99,69.40)
(Training, Validation) accuracies: (71.89,70.78)
(Training, Validation) accuracies: (71.56,73.28)
(Training, Validation) accuracies: (71.81,71.12)
Mean Training Accuracy: 71.98 +/- 0.35
Mean Validation Accuracy: 71.22 +/- 1.25


### Logistic Regression

In [27]:
c = 1e-5
LR = LogisticRegression(C=c)
evaluate_model(LR, X_tot, Y, skf)

(Training, Validation) accuracies: (72.52,71.46)
(Training, Validation) accuracies: (72.88,70.67)
(Training, Validation) accuracies: (72.60,71.61)
(Training, Validation) accuracies: (71.91,73.76)
(Training, Validation) accuracies: (72.51,72.35)
Mean Training Accuracy: 72.49 +/- 0.32
Mean Validation Accuracy: 71.97 +/- 1.04
