In [1]:
import pandas as pd
import numpy as np
import datetime
import altair as alt

In [2]:
df = pd.read_csv('bike.csv')
df.datetime = pd.to_datetime(df.datetime)
df['hour'] = df.datetime.dt.hour
df['month'] = df.datetime.dt.month
df['year'] = df.datetime.dt.year
df['first_day_of_month'] = df.datetime.apply(lambda date : datetime.date(date.year, date.month, 1))

In [3]:
dico_indx_weekday = {0 : 'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}

In [4]:
df['weekday'] = df.datetime.apply(lambda x : dico_indx_weekday[x.weekday()])

In [5]:
df['weekday_index'] = df.datetime.apply(lambda x : x.weekday())

In [6]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,year,first_day_of_month,weekday,weekday_index
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,0,1,2011,2011-01-01,Saturday,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,2011,2011-01-01,Saturday,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2,1,2011,2011-01-01,Saturday,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,3,1,2011,2011-01-01,Saturday,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,4,1,2011,2011-01-01,Saturday,5


In [24]:
df_ = df[df.hour==17][['weekday','hour','casual','registered']]
df_

Unnamed: 0,weekday,hour,casual,registered
17,Saturday,17,15,52
40,Sunday,17,7,58
62,Monday,17,11,146
85,Tuesday,17,10,202
108,Wednesday,17,4,186
...,...,...,...,...
10783,Saturday,17,35,274
10807,Sunday,17,29,214
10831,Monday,17,17,478
10855,Tuesday,17,39,533


In [23]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Violin(x=df['weekday'],
                        y=df['registered'],
                        legendgroup='Registered', 
                        scalegroup='Registered', 
                        name='Registered',
                        side='positive',
                        line_color='orange')
             )

fig.add_trace(go.Violin(x=df['weekday'],
                        y=df['casual'],
                        legendgroup='Casual', 
                        scalegroup='Casual', 
                        name='Casual',
                        side='negative',
                        line_color='blue')
             )


fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')
fig.show()

In [9]:
df[(df.month==1) & (df.year==2011)]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,year,first_day_of_month,weekday,weekday_index
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,0,1,2011,2011-01-01,Saturday,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,1,1,2011,2011-01-01,Saturday,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2,1,2011,2011-01-01,Saturday,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,3,1,2011,2011-01-01,Saturday,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,4,1,2011,2011-01-01,Saturday,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426,2011-01-19 19:00:00,1,0,1,1,13.12,14.395,57,27.9993,4,108,112,19,1,2011,2011-01-01,Wednesday,2
427,2011-01-19 20:00:00,1,0,1,1,13.12,15.150,49,19.9995,2,74,76,20,1,2011,2011-01-01,Wednesday,2
428,2011-01-19 21:00:00,1,0,1,1,13.12,14.395,49,27.9993,4,55,59,21,1,2011,2011-01-01,Wednesday,2
429,2011-01-19 22:00:00,1,0,1,1,12.30,15.150,52,11.0014,6,53,59,22,1,2011,2011-01-01,Wednesday,2


In [10]:
df__ = (pd.DataFrame(df.groupby(['first_day_of_month'])
        .agg({'casual':'mean','registered':'mean'})
        .astype('int').stack())
        .reset_index()
        .rename(columns={'level_0':'month','level_1':'client_type',0:'number'})
        )
df__.first_day_of_month = pd.to_datetime(df__.first_day_of_month)

In [11]:
df.temp.min()

0.82

In [12]:
df.weather.value_counts()

1    7192
2    2834
3     859
4       1
Name: weather, dtype: int64

In [13]:
df[df.month==1].weather.mode().iloc[0]

1

In [14]:
area = alt.Chart(df__).mark_area(opacity=0.3).encode(
            x=alt.X('first_day_of_month:T',title='Month'),
            y=alt.Y("number:Q", title='Number of bikes rent', stack=None),
            color="client_type:N"
        ).properties(
                
                width=1000,
                height=300,
            )

area

In [15]:
df__ = (pd.DataFrame(df[df.year==2012].groupby(['first_day_of_month'])
        .agg({'casual':'mean','registered':'mean'})
        .astype('int').stack())
        .reset_index()
        .rename(columns={'level_0':'month','level_1':'client_type',0:'number'})
        )
df__.first_day_of_month = pd.to_datetime(df__.first_day_of_month)

area = alt.Chart(df__).mark_area(opacity=0.3).encode(
            x=alt.X('month(first_day_of_month):T',title='Month'),
            y=alt.Y("number:Q", title='Number of bikes rent', stack=None),
            color="client_type:N"
        )

area

In [16]:
area = alt.Chart(df__).mark_area(opacity=0.3).encode(
            x=alt.X('month(first_day_of_month):T',title='Month'),
            y=alt.Y("number:Q", title='Number of bikes rent', stack=None),
            color="client_type:N"
        )

area

In [17]:
xrule = alt.Chart().mark_rule(color="red", strokeWidth=2).encode(x=alt.datum(alt.DateTime( month="March")))

In [18]:
area + xrule

In [19]:
list_months = ['January','February','March','April','May','June','July','August','September','October','November','December']


In [20]:
dico_indx_month = {i:m for i,m in zip(list(range(1,13)),list_months)}
dico_month_indx = {m:i for i,m in zip(list(range(1,13)),list_months)}

In [21]:
alt.Chart(df_2).mark_line(point = False).encode(
    x = alt.X("hour:O", title="Nour of day"),
    y= alt.Y("number:Q",title="Number of bikes rent"),
    color=alt.Color("client_type:N",title='Client Type')
).properties(
    title="Number of bikes rent hourly in "+dico_indx_month[month],
    width=600,
    height=150,
)

NameError: name 'df_2' is not defined

In [None]:
df.temp.quantile(0.25)

13.94

In [None]:
def filter_dataset(month = None,
                   year = None,
                   holiday = None,
                   weekend = None,
                   weather = None,
                   temperature = None,
                   humidity = None,
                   windspeed = None):
    df_result = df.copy()
    if month is not None :
        df_result = df_result[df_result.month == month]
    if year is not None :
        df_result = df_result[df_result.year == year]    
    if holiday is not None :
        df_result = df_result[df_result.holiday == holiday]
    if weekend is not None :
        df_result = df_result[df_result.workingday != weekend]
    if weather is not None :
        df_result = df_result[df_result.weather == weather]    
    if temperature is not None :
        df_result = df_result[(df_result.temp < temperature + 0.5) & (df_result.temp > temperature - 0.5) ] 
    if humidity is not None :
        df_result = df_result[(df_result.humidity < humidity + 0.5) & (df_result.humidity > humidity - 0.5) ] 
    if windspeed is not None :
        df_result = df_result[(df_result.windspeed < windspeed + 0.5) & (df_result.windspeed > windspeed - 0.5) ] 
    return df_result

In [None]:
df[(df.month==5)&(df.year==2011)].describe().loc['25%']

season           2.0000
holiday          0.0000
workingday       0.0000
weather          1.0000
temp            19.6800
atemp           23.4850
humidity        58.7500
windspeed        7.0015
casual           8.0000
registered      39.0000
count           50.0000
month            5.0000
year          2011.0000
Name: 25%, dtype: float64

In [None]:
df_filtered = filter_dataset(month = 4,
                   year = 2011,
                   holiday = None,
                   weekend = None,
                   weather = None,
                   temperature = None,
                   humidity = None,
                   windspeed = None)
df_filtered                   

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,year,first_day_of_month
1323,2011-04-01 00:00:00,2,0,1,3,10.66,12.880,100,11.0014,0,6,6,4,2011,2011-04-01
1324,2011-04-01 01:00:00,2,0,1,3,10.66,12.880,100,11.0014,0,4,4,4,2011,2011-04-01
1325,2011-04-01 02:00:00,2,0,1,3,10.66,12.880,93,12.9980,0,7,7,4,2011,2011-04-01
1326,2011-04-01 03:00:00,2,0,1,2,9.84,11.365,93,16.9979,0,4,4,4,2011,2011-04-01
1327,2011-04-01 04:00:00,2,0,1,2,9.84,11.365,93,16.9979,0,3,3,4,2011,2011-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773,2011-04-19 19:00:00,2,0,1,2,20.50,24.240,72,12.9980,28,248,276,4,2011,2011-04-01
1774,2011-04-19 20:00:00,2,0,1,2,20.50,24.240,72,15.0013,20,148,168,4,2011,2011-04-01
1775,2011-04-19 21:00:00,2,0,1,2,20.50,24.240,72,8.9981,8,114,122,4,2011,2011-04-01
1776,2011-04-19 22:00:00,2,0,1,1,19.68,23.485,77,6.0032,3,109,112,4,2011,2011-04-01


In [None]:
df_filtered.temp.describe()

count    76.000000
mean     16.583421
std       3.346561
min       9.840000
25%      13.940000
50%      16.810000
75%      18.860000
max      25.420000
Name: temp, dtype: float64

In [None]:
def df_month(month):
    return pd.DataFrame(df[df.month==month].groupby('hour').agg({'casual':'mean','registered':'mean'}).stack()).reset_index().rename(columns={'level_1':'client_type',0:'number'})

In [None]:
df_2 = df_month(2)
df_2

Unnamed: 0,hour,client_type,number
0,0,casual,2.973684
1,0,registered,25.868421
2,1,casual,1.763158
3,1,registered,15.131579
4,2,casual,1.513514
5,2,registered,10.945946
6,3,casual,0.852941
7,3,registered,4.970588
8,4,casual,0.242424
9,4,registered,2.393939


In [None]:
df_2 = df[df.month==3].groupby('hour').agg({'casual':'mean','registered':'mean'}).reset_index().rename(columns={'index':'hour'})
df_2

Unnamed: 0,hour,casual,registered
0,0,5.289474,27.578947
1,1,4.0,18.815789
2,2,3.352941,10.882353
3,3,1.388889,6.916667
4,4,1.117647,3.0
5,5,0.837838,11.621622
6,6,1.947368,49.473684
7,7,8.184211,144.368421
8,8,15.684211,284.315789
9,9,19.447368,167.0
