# ESILV - Python for data analysis - project 2022

In [1]:
#Import
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline
import seaborn as sns

In [2]:
#Load the dataset (the CSV file is read with the Latin-1 encoding)
DATASET_PATH = "SeoulBikeData.csv"
df = pd.read_csv(DATASET_PATH, encoding="latin1")
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


# 1) Clean Data

Dataset: one year of hourly bike rental data

Target: predict the right number of bikes rented per hour, so we are going to try linear regression

In [3]:
#Check whether the dataframe contains any NaN value
null_mask = df.isnull().values
check_for_nan = null_mask.any()
print(check_for_nan)

False


In [4]:
#Create a correlation matrix of the numeric columns.
#The non-numeric columns are excluded explicitly: recent pandas versions no
#longer drop them silently in corr() and raise on string columns instead.
df.select_dtypes(include='number').corr().style.background_gradient(cmap='Blues')

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
Rented Bike Count,1.0,0.410257,0.538558,-0.19978,0.121108,0.19928,0.379788,0.261837,-0.123074,-0.141804
Hour,0.410257,1.0,0.124114,-0.241644,0.285197,0.098753,0.003054,0.145131,0.008715,-0.021516
Temperature(°C),0.538558,0.124114,1.0,0.159371,-0.036252,0.034794,0.912798,0.353505,0.050282,-0.218405
Humidity(%),-0.19978,-0.241644,0.159371,1.0,-0.336683,-0.54309,0.536894,-0.461919,0.236397,0.108183
Wind speed (m/s),0.121108,0.285197,-0.036252,-0.336683,1.0,0.171507,-0.176486,0.332274,-0.019674,-0.003554
Visibility (10m),0.19928,0.098753,0.034794,-0.54309,0.171507,1.0,-0.17663,0.149738,-0.167629,-0.121695
Dew point temperature(°C),0.379788,0.003054,0.912798,0.536894,-0.176486,-0.17663,1.0,0.094381,0.125597,-0.150887
Solar Radiation (MJ/m2),0.261837,0.145131,0.353505,-0.461919,0.332274,0.149738,0.094381,1.0,-0.07429,-0.072301
Rainfall(mm),-0.123074,0.008715,0.050282,0.236397,-0.019674,-0.167629,0.125597,-0.07429,1.0,0.0085
Snowfall (cm),-0.141804,-0.021516,-0.218405,0.108183,-0.003554,-0.121695,-0.150887,-0.072301,0.0085,1.0


In [5]:
#Transform the categorical (qualitative) variables into integer codes
#NOTE(review): 'Seasons' is coded 0..3, which imposes an order on the seasons;
#consider one-hot encoding if this feeds a linear model - to be confirmed.
CATEGORY_CODES = {
    'Functioning Day': {'No': 0, 'Yes': 1},
    'Holiday': {'No Holiday': 0, 'Holiday': 1},
    'Seasons': {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Autumn': 3},
}

for column, codes in CATEGORY_CODES.items():
    #Values that are not a known label pass through unchanged, so running this
    #cell a second time (when the column already holds the codes) is harmless.
    #The explicit cast guarantees an int64 column instead of relying on implicit
    #dtype inference, and raises if an unexpected label is left in the column.
    df[column] = df[column].map(lambda value: codes.get(value, value)).astype('int64')

In [6]:
#Transform the 'Date' column from day/month/year strings to datetime64 values.
#A single call is enough: the format has no time component, so there is no
#need to go through Python date objects and convert a second time.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

In [7]:
#Check the data type of each column
df.dtypes

Date                         datetime64[ns]
Rented Bike Count                     int64
Hour                                  int64
Temperature(°C)                     float64
Humidity(%)                           int64
Wind speed (m/s)                    float64
Visibility (10m)                      int64
Dew point temperature(°C)           float64
Solar Radiation (MJ/m2)             float64
Rainfall(mm)                        float64
Snowfall (cm)                       float64
Seasons                               int64
Holiday                               int64
Functioning Day                       int64
dtype: object

In [9]:
#Preview the first rows of the dataframe
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,0,1
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,0,0,1
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,0,0,1
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,0,0,1
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,0,0,1


# 2) Data-visualization

In [20]:
#Build a per-day dataframe for the scatter plots:
#the rented bike count is summed over each date, every other variable is averaged
MEAN_COLUMNS = [
    'Temperature(°C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Dew point temperature(°C)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
    'Seasons',
    'Holiday',
    'Functioning Day',
]

#One-element lists keep the two-level (column, statistic) header on the result
daily_agg_spec = {'Rented Bike Count': ['sum']}
for mean_column in MEAN_COLUMNS:
    daily_agg_spec[mean_column] = ['mean']

df2 = df.groupby('Date').agg(daily_agg_spec)
df2.head()

Unnamed: 0_level_0,Rented Bike Count,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
Unnamed: 0_level_1,sum,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2017-12-01,9539,-2.454167,45.875,1.5375,1870.75,-13.545833,0.24875,0.0,0.0,0,0,1.0
2017-12-02,8523,1.325,61.958333,1.7125,1471.083333,-5.716667,0.26375,0.0,0.0,0,0,1.0
2017-12-03,7222,4.875,81.541667,1.6125,455.75,1.883333,0.125417,0.166667,0.0,0,0,1.0
2017-12-04,8729,-0.304167,52.5,3.45,1362.833333,-9.925,0.282917,0.004167,0.0,0,0,1.0
2017-12-05,8307,-4.458333,36.416667,1.108333,1959.458333,-17.425,0.035833,0.0,0.0,0,0,1.0


In [10]:
#Run a PCA (principal component analysis) on Temperature and Dew point temperature
#and look at the contribution of each variable

#The variables Humidity, Rainfall and Snowfall
#See the scree plot

#Filter out the variables of the dataset that are not useful
#(find their correlation with Rented Bike Count)
#uselesscolum = ['Humidity(%)','Dew point temperature(°C)']
#df.drop(uselesscolum,axis=1, inplace=True)
#df

#Run a grid search to compare several models
#Build the API only for the best model
#Slides (ppt): explain the approach, without too much explanatory text

In [12]:
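#Boosting and random forest both exist as regressors and as classifiers.
#The target here (Rented Bike Count) is a number, so the regressor variants apply.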