# Modélisation : Test des différents algorithmes

In [12]:
import pandas as pd
import numpy as np
# Load the bike-sharing CSV; a comma is already read_csv's default separator.
data = pd.read_csv('SeoulBikeData.csv')

## Transformation des mots en chiffres

In [13]:
# Holiday flag: 0 for the 'No Holiday' label, 1 for any other value.
data['Holiday (int)'] = data['Holiday'].ne('No Holiday').astype('int64')

# Season code 1-4; labels outside the mapping become NaN, hence the float dtype.
season_codes = {'Winter': 1, 'Spring': 2, 'Summer': 3, 'Autumn': 4}
data['Seasons (int)'] = data['Seasons'].map(season_codes).astype('float64')

# Dates are written day/month/year in the CSV.
data['Date'] = pd.to_datetime(data['Date'], format="%d/%m/%Y")

# Functioning-day flag: 1 for 'Yes', 0 for any other value.
data['Functioning Day (int)'] = data['Functioning Day'].eq('Yes').astype('int64')

# Modelling frame: the text/date columns are dropped now that numeric
# versions exist; `data` itself keeps them.
df = data.drop(columns=['Holiday', 'Date', 'Seasons', 'Functioning Day'])

df.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Holiday (int),Seasons (int),Functioning Day (int)
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,1.0,1
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,0,1.0,1
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,0,1.0,1
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,0,1.0,1
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,0,1.0,1


## Les données

Nous allons utiliser deux jeux de données d'entrée (X) pour les différents tests.

- Le premier garde toutes les informations (colonnes) données par le dataset
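- Le deuxième garde seulement les informations que nous avons jugées importantes dans la partie Data-visualisation (sans `Functioning Day (int)` ni `Dew point temperature(°C)`), auxquelles s'ajoute un indicateur de jour ouvré (`working_day`)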

In [18]:
# Feature set 1: columns 1..12 of df, i.e. everything except the first column.
X = df.iloc[:, 1:13]

# Feature set 2: same columns minus two, plus a weekday indicator.
X2 = X.drop(columns=['Functioning Day (int)', 'Dew point temperature(°C)'])
# dayofweek is 0 for Monday, so "< 5" selects Monday to Friday.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
X2['working_day'] = (data['Date'].dt.dayofweek < 5).astype(int)

# Target: first column of df (Rented Bike Count in the df.head() preview).
y = df.iloc[:, 0]

# scikit-learn is fed plain NumPy arrays from here on.
X = X.values
X2 = X2.values
y = y.values

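Séparation en jeux d'entraînement et de test (les premiers 75 % des lignes pour l'entraînement, le reste pour le test, sans mélange) :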

In [19]:
# Ordered split without shuffling: the first 75 % of the rows train the
# models, the remaining rows are held out for testing.
n_train = int(len(y) * 0.75)
train_rows = slice(None, n_train)
test_rows = slice(n_train, None)

trainX, testX = X[train_rows], X[test_rows]
trainX2, testX2 = X2[train_rows], X2[test_rows]
trainy, testy = y[train_rows], y[test_rows]

## Fonctions pour les différents algorithmes

In [6]:
from sklearn.model_selection import GridSearchCV
# Logistic Regression
from sklearn.linear_model import LogisticRegression
def MLogisticR(trainX, trainy):
    """Fit a logistic-regression classifier on the training data.

    Parameters
    ----------
    trainX : array-like
        Training feature matrix.
    trainy : array-like
        Training targets; the classifier treats each distinct value as a class.

    Returns
    -------
    LogisticRegression
        The fitted estimator, created with ``random_state=0``.
    """
    classifier = LogisticRegression(random_state=0)
    classifier.fit(trainX, trainy)
    return classifier

# Linear Regression
from sklearn.linear_model import LinearRegression
def MLinearR(trainX, trainy):
    """Fit an ordinary least-squares linear regression on the training data.

    Parameters
    ----------
    trainX : array-like
        Training feature matrix.
    trainy : array-like
        Training targets.

    Returns
    -------
    LinearRegression
        The estimator fitted on the whole training set.
    """
    # No hyper-parameter is tuned, so the model is fitted directly: a grid
    # search over an empty grid would only add cross-validation fits whose
    # scores are never used, and return this same model.
    return LinearRegression().fit(trainX, trainy)

# SVM
from sklearn import svm
def MSVM(trainX, trainy):
    """Grid-search an RBF-kernel support vector regressor and return the best one.

    The search covers ``C`` and ``epsilon`` with ``gamma`` fixed at 0.03, using
    GridSearchCV's default cross-validation and scoring on all CPU cores.

    Parameters
    ----------
    trainX : array-like
        Training feature matrix.
    trainy : array-like
        Training targets.

    Returns
    -------
    sklearn.svm.SVR
        The best configuration, refitted by GridSearchCV on the whole
        training set.
    """
    parameters = {
        'C': [190, 200, 250, 260],
        'kernel': ['rbf'],
        'gamma': [0.03],
        'epsilon': [1.5, 2, 2.5],
        # 'degree' is deliberately not searched: SVR only uses it with the
        # 'poly' kernel, so listing two values here doubled the number of
        # fits without changing any score.
    }
    grid = GridSearchCV(svm.SVR(), parameters, n_jobs=-1)
    grid.fit(trainX, trainy)
    return grid.best_estimator_

# SVC
def MSVC(trainX, trainy):
    """Fit a support vector classifier through a single-candidate grid search.

    Every option below has exactly one candidate value, so GridSearchCV
    cross-validates a single configuration and returns it refitted.

    Parameters
    ----------
    trainX : array-like
        Training feature matrix.
    trainy : array-like
        Training targets; the classifier treats each distinct value as a class.

    Returns
    -------
    sklearn.svm.SVC
        The ``best_estimator_`` of the search.
    """
    # Trailing comments list the extra candidates that were left disabled.
    fixed_options = {
        "C": 1.0,               # 0.5, 0.1, 2
        "kernel": 'rbf',        # 'poly', 'sigmoid'
        "degree": 2,            # 3, 4, 5, 6
        "gamma": 0.01,          # 0.1, 0.001, 0.5
        "coef0": 0.0,
        "shrinking": True,
        "probability": True,
        "tol": 0.001,
        "cache_size": 10,
        "class_weight": None,
        "verbose": False,
        "max_iter": -1,
        "random_state": None,
    }
    # GridSearchCV expects a list of candidates per option.
    parameters = {option: [value] for option, value in fixed_options.items()}

    search = GridSearchCV(svm.SVC(), parameters, n_jobs=-1)
    search.fit(trainX, trainy)
    return search.best_estimator_

# KNN
from sklearn.neighbors         import KNeighborsClassifier
def MKNN(trainX, trainy):
    """Grid-search a k-nearest-neighbours classifier and return the best one.

    Candidates: seven neighbourhood sizes, both weighting schemes and the
    four neighbour-search algorithms, evaluated on all CPU cores.

    Parameters
    ----------
    trainX : array-like
        Training feature matrix.
    trainy : array-like
        Training targets; the classifier treats each distinct value as a class.

    Returns
    -------
    KNeighborsClassifier
        The ``best_estimator_`` of the search.
    """
    search = GridSearchCV(
        KNeighborsClassifier(),
        dict(
            n_neighbors=[2, 3, 5, 8, 10, 15, 20],
            weights=['uniform', 'distance'],
            algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
        ),
        n_jobs=-1,
    )
    # fit() returns the search object itself.
    return search.fit(trainX, trainy).best_estimator_

# Random Forest Classifier
from sklearn.ensemble          import RandomForestClassifier
def MRandomFC(trainX, trainy):
    """Fit a random-forest classifier with default hyper-parameters.

    Parameters
    ----------
    trainX : array-like
        Training feature matrix.
    trainy : array-like
        Training targets; the classifier treats each distinct value as a class.

    Returns
    -------
    RandomForestClassifier
        The estimator fitted on the whole training set.
    """
    # Nothing is tuned, so the forest is fitted directly rather than through
    # a grid search over an empty grid, which only added cross-validation
    # fits whose scores were never used.
    # n_jobs=-1 keeps the use of all CPU cores; random_state=0 matches
    # MLogisticR and makes the forest reproducible between runs.
    forest = RandomForestClassifier(n_jobs=-1, random_state=0)
    return forest.fit(trainX, trainy)

# Gradient Boosting classifier
from sklearn.ensemble import GradientBoostingClassifier
def MGradientBC(trainX, trainy):
    """Grid-search a gradient-boosting classifier and return the best one.

    Candidates: three ensemble sizes crossed with two learning rates,
    evaluated on all CPU cores.

    Parameters
    ----------
    trainX : array-like
        Training feature matrix.
    trainy : array-like
        Training targets; the classifier treats each distinct value as a class.

    Returns
    -------
    GradientBoostingClassifier
        The ``best_estimator_`` of the search.
    """
    candidates = dict(n_estimators=[10, 20, 30], learning_rate=[0.01, 0.1])
    tuner = GridSearchCV(GradientBoostingClassifier(), candidates, n_jobs=-1)
    tuner.fit(trainX, trainy)
    return tuner.best_estimator_

## Score de chaque algorithme avec nos données

In [8]:
# Model labels, used as the row index of the final score table.
axeX = [
    "Logistic Regression",
    "Linear Regression",
    "SVM",
    "SVC",
    "Knn",
    "Random Forest Classifier",
]

# Test-set scores and fitted models, one list per feature set:
# "Full" is filled from X, "Part" from X2.
scoreFull, scorePart = [], []
modelFull, modelPart = [], []

In [9]:
# Train every model on the full feature set, then score it on the held-out
# rows. The builders are listed in the same order as the labels in axeX.
# Slice assignment refills the existing lists, so re-running this cell
# replaces the previous results instead of appending duplicates that would
# no longer line up with axeX.
modelFull[:] = [
    build(trainX, trainy)
    for build in (MLogisticR, MLinearR, MSVM, MSVC, MKNN, MRandomFC)
]
scoreFull[:] = [fitted.score(testX, testy) for fitted in modelFull]

In [None]:
# Train every model on the reduced feature set, then score it on the held-out
# rows. The builders are listed in the same order as the labels in axeX.
# Slice assignment refills the existing lists, so re-running this cell
# replaces the previous results instead of appending duplicates that would
# no longer line up with axeX.
modelPart[:] = [
    build(trainX2, trainy)
    for build in (MLogisticR, MLinearR, MSVM, MSVC, MKNN, MRandomFC)
]
scorePart[:] = [fitted.score(testX2, testy) for fitted in modelPart]

In [None]:
import matplotlib.pyplot as plt

# Score table: one row per model, one column per feature set.
plot = pd.DataFrame(
    np.c_[scoreFull, scorePart],
    index=axeX,
    columns=["X (all columns)", "X2 (selected columns)"],
)

# Chart the score table itself; calling df.plot.bar() here would draw the
# feature DataFrame instead of the scores.
# NOTE: scikit-learn's score() is R² for regressors and mean accuracy for
# classifiers, so the bars are not all measured on the same metric.
fig, ax = plt.subplots()
plot.plot.bar(ax=ax)
ax.set(title="Test-set score per model", xlabel="Model", ylabel="score()")

plt.show()