# Logistic Regression

In [1]:
import sys
sys.chdir('../')

import numpy as np
import pandas as pd
import random
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from src.utils import preprocess
from tqdm import tqdm
random.seed(7)

%matplotlib inline
import matplotlib.pyplot as plt



## Loading data

In [2]:
# Locations of the three challenge CSV files, relative to the working directory.
train_inputs_csv = 'data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv'
train_outputs_csv = 'data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv'
test_inputs_csv = 'data/challenge_fichiers_dentrees_de_test_challenge_nba/test.csv'

# Training inputs and test inputs are read with read_csv's default separator.
Data_X_train = pd.read_csv(train_inputs_csv)
X_test = pd.read_csv(test_inputs_csv)

# The training-output file is parsed with ';' as the field separator.
Data_Y_train = pd.read_csv(train_outputs_csv, sep=';')

**Data preprocessing: split into train/validation sets and remove the ID**

In [3]:
# `preprocess` is a project helper from src.utils; its result is unpacked into
# the training and validation inputs/targets used by every later cell.
# The third argument is presumably the fraction of rows kept for training --
# TODO confirm against src.utils.preprocess.
split_ratio = 0.7
X_train, Y_train, X_val, Y_val = preprocess(Data_X_train, Data_Y_train, split_ratio)

## Model evaluation

In [4]:
# Candidate values for LogisticRegression's C (inverse of regularization
# strength: smaller values mean stronger regularization).
C = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10, 100, 1000]

# Validation accuracy for each C, kept so the best value can be looked up
# afterwards (e.g. max(validation_scores, key=validation_scores.get)) instead
# of being read off the printed log of a very long run.
validation_scores = {}

for c in tqdm(C) :
    # Pin the solver: liblinear was the implicit default in the scikit-learn
    # versions that still shipped sklearn.cross_validation; newer releases
    # default to lbfgs, which would silently change the fitted models.
    model = LogisticRegression(C=c, solver='liblinear')
    model.fit(X_train, Y_train)
    # Mean accuracy on the held-out validation split.
    score = model.score(X_val, Y_val)
    validation_scores[c] = score
    print("C = {} - Validation Accuray : {:.1f}%".format(c,score*100))

  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

C = 1e-10 - Validation Accuray : 69.1%


  7%|█████▉                                                                             | 1/14 [00:17<03:45, 17.32s/it]

C = 1e-09 - Validation Accuray : 69.2%


 14%|███████████▊                                                                       | 2/14 [00:23<02:22, 11.86s/it]

C = 1e-08 - Validation Accuray : 70.3%


 21%|█████████████████▊                                                                 | 3/14 [00:30<01:52, 10.26s/it]

C = 1e-07 - Validation Accuray : 71.8%


 29%|███████████████████████▋                                                           | 4/14 [00:39<01:37,  9.77s/it]

C = 1e-06 - Validation Accuray : 72.9%


 36%|█████████████████████████████▋                                                     | 5/14 [00:52<01:35, 10.57s/it]

C = 1e-05 - Validation Accuray : 73.0%


 43%|███████████████████████████████████▌                                               | 6/14 [01:20<01:47, 13.46s/it]

C = 0.0001 - Validation Accuray : 72.3%


 50%|█████████████████████████████████████████▌                                         | 7/14 [02:23<02:23, 20.46s/it]

C = 0.001 - Validation Accuray : 71.7%


 57%|███████████████████████████████████████████████▍                                   | 8/14 [04:50<03:37, 36.27s/it]

C = 0.01 - Validation Accuray : 69.4%


 64%|█████████████████████████████████████████████████████▎                             | 9/14 [11:11<06:13, 74.65s/it]

C = 0.1 - Validation Accuray : 67.5%


 71%|██████████████████████████████████████████████████████▎                     | 10/14 [8:14:00<3:17:36, 2964.09s/it]

C = 1 - Validation Accuray : 66.4%


 79%|███████████████████████████████████████████████████████████▋                | 11/14 [8:53:19<2:25:27, 2909.00s/it]

C = 10 - Validation Accuray : 65.8%


 86%|█████████████████████████████████████████████████████████████████▏          | 12/14 [9:26:23<1:34:23, 2831.95s/it]

C = 100 - Validation Accuray : 65.7%


 93%|████████████████████████████████████████████████████████████████████████▍     | 13/14 [9:54:07<45:42, 2742.12s/it]

C = 1000 - Validation Accuray : 65.6%


100%|█████████████████████████████████████████████████████████████████████████████| 14/14 [10:27:57<00:00, 2691.24s/it]


## Cross Validation

In [5]:
# Evaluate the model using 10-fold cross-validation on the training split.
# The solver is pinned to liblinear (the implicit default in older scikit-learn
# releases) so the estimator does not change with the installed version.
cv_model = LogisticRegression(C=1e-5, solver='liblinear')
scores = cross_val_score(cv_model, X_train, Y_train, scoring='accuracy', cv=10)
# One accuracy value per fold.
print(scores)

[ 0.70715096  0.70942111  0.723042    0.74318182  0.73068182  0.70795455
  0.70340909  0.71818182  0.725       0.72954545]


## Penalty Test

Test with an L1 penalty

In [6]:
# Same C as in the cross-validation cell, but with an L1 penalty.
c = 1e-5
# penalty='l1' is not supported by the lbfgs solver, which is the default in
# scikit-learn >= 0.22 and raises a ValueError for this combination. liblinear
# supports L1 and was the implicit default in older releases, so naming it
# explicitly keeps the cell working (and equivalent) across versions.
model = LogisticRegression(C=c, penalty = 'l1', solver='liblinear')
model.fit(X_train, Y_train)
# Mean accuracy on the held-out validation split.
score = model.score(X_val, Y_val)
print("C = {} - Validation Accuray : {:.1f}%".format(c,score*100))

C = 1e-05 - Validation Accuray : 57.2%
