In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the 'Data' sheet of the cycle-hire .xlsx workbook and discard the
# 'Unnamed: 0' column in the same step.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs where this exact file exists; consider a configurable relative data dir.
url = r'C:\Users\rergu\OneDrive\IronHack\MiniProjects\Week3-Project\tfl-daily-cycle-hires.xlsx'
df_bike = pd.read_excel(url, sheet_name='Data').drop(columns=['Unnamed: 0'])

In [8]:
# Preview the first rows of df_bike
df_bike.head()

Unnamed: 0,Day,Number of Bicycle Hires
0,2015-01-01,9615
1,2015-01-02,15389
2,2015-01-03,5779
3,2015-01-04,9367
4,2015-01-05,20566


In [4]:
# Summarise df_bike: column names, non-null counts and dtypes
df_bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827 entries, 0 to 1826
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Day                      1827 non-null   datetime64[ns]
 1   Number of Bicycle Hires  1827 non-null   int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 28.7 KB


In [5]:
# Normalise column labels (lower-case, spaces -> underscores), then rename
# the resulting 'day' column to 'date'.
df_bike = (
    df_bike
    .rename(columns=lambda label: label.lower().replace(" ", "_"))
    .rename(columns={'day': 'date'})
)

In [6]:
# Split the 'date' column into separate day / month / year columns
df_bike = df_bike.assign(
    day=df_bike['date'].dt.day,
    month=df_bike['date'].dt.month,
    year=df_bike['date'].dt.year,
)

In [None]:
# Per calendar day (month, day): mean and standard deviation of the hires.
# as_index=False keeps 'month' and 'day' as ordinary columns.
grouped_df_bike = (
    df_bike
    .groupby(['month', 'day'], as_index=False)
    .agg(
        number_of_bicycle_hires_mean=('number_of_bicycle_hires', 'mean'),
        number_of_bicycle_hires_std=('number_of_bicycle_hires', 'std'),
    )
)


Unnamed: 0,month,day,number_of_bicycle_hires_mean,number_of_bicycle_hires_std
95,4,5,28373.8,5371.457037
53,2,23,24503.6,3050.412316
155,6,4,33574.0,3940.352713
93,4,3,23325.0,8356.773031
171,6,20,34303.8,9299.426472
29,1,30,23277.2,4447.01863
18,1,19,23278.0,3976.571262
59,2,29,23738.0,
4,1,5,20549.2,3887.664427
119,4,29,25975.0,6514.950806


In [None]:
# Optional: use (month, day) as the index of grouped_df_bike (currently disabled)
#grouped_df_bike = grouped_df_bike.set_index(['month', 'day'])

In [None]:
# Total hires per (year, month, day); as_index=False keeps the three
# grouping keys as ordinary columns.
grouped_year_df_bike = (
    df_bike
    .groupby(['year', 'month', 'day'], as_index=False)
    .agg(number_of_bicycle_hires_sum=('number_of_bicycle_hires', 'sum'))
)



In [11]:
# data cleaning and fine-tuning
# Give the aggregated columns shorter names.
grouped_year_df_bike = grouped_year_df_bike.rename(
    columns={'number_of_bicycle_hires_sum': 'bicycle_hires'}
)

# The std is NaN for a (month, day) that has no computable spread (the Feb 29
# row in the grouped output). Python's round() raises ValueError on NaN, so in
# top-to-bottom order this cell used to fail before the later "drop feb 29"
# cell could remove that row. Drop NaN-std rows here, *before* rounding both
# statistics to whole numbers.
grouped_df_bike = (
    grouped_df_bike
    .rename(columns={
        'number_of_bicycle_hires_mean': 'bicycle_hires_ave',
        'number_of_bicycle_hires_std': 'bicycle_hires_std',
    })
    .dropna(subset=['bicycle_hires_std'])
    .assign(
        bicycle_hires_ave=lambda stats: stats['bicycle_hires_ave'].apply(round),
        bicycle_hires_std=lambda stats: stats['bicycle_hires_std'].apply(round),
    )
)

In [10]:
# Remove Feb 29 from both frames and renumber their rows from 0.
condition1 = grouped_year_df_bike['month'] == 2
condition2 = grouped_year_df_bike['day'] == 29
index_feb29 = grouped_year_df_bike.loc[condition1 & condition2].index

# Yearly frame: drop the matching rows by index label.
grouped_year_df_bike = (
    grouped_year_df_bike
    .drop(index=index_feb29)
    .reset_index(drop=True)
)

# Per-calendar-day frame: rows are removed via their missing std value.
grouped_df_bike = (
    grouped_df_bike
    .dropna(subset=['bicycle_hires_std'])
    .reset_index(drop=True)
)

In [12]:
# Index labels of any rows whose std is still missing
grouped_df_bike[grouped_df_bike["bicycle_hires_std"].isna()].index

Index([], dtype='int64')

In [13]:
# Preview the first 10 rows of the per-date hire totals
grouped_year_df_bike.head(10)

Unnamed: 0,year,month,day,bicycle_hires
0,2015,1,1,9615
1,2015,1,2,15389
2,2015,1,3,5779
3,2015,1,4,9367
4,2015,1,5,20566
5,2015,1,6,20765
6,2015,1,7,22504
7,2015,1,8,15769
8,2015,1,9,22285
9,2015,1,10,14877


In [14]:
# Preview the first 10 rows of the per-(month, day) mean/std table
grouped_df_bike.head(10)

Unnamed: 0,month,day,bicycle_hires_ave,bicycle_hires_std
0,1,1,10268,2463
1,1,2,13746,4598
2,1,3,14422,8329
3,1,4,18108,5169
4,1,5,20549,3888
5,1,6,18224,4062
6,1,7,18507,5652
7,1,8,20271,5748
8,1,9,24650,9447
9,1,10,22048,6785


In [15]:
# Attach the per-(month, day) mean and std to every dated row (left join,
# so every row of grouped_year_df_bike is kept).
merged_df_bike = grouped_year_df_bike.merge(
    grouped_df_bike,
    how='left',
    on=['month', 'day'],
)

In [17]:
# Standardise each day's hires against the mean/std of its calendar day
"""Compute Z-Scores (Standardized Deviation)
Z-score tells you how many standard deviations away a particular day's value is from the mean:
Z-score = (actual_value - mean)/std

Interpretations -->
z ≈ 0: The day is typical (very close to average).
|z| > 1: The day is somewhat unusual.
|z| > 2: The day is statistically significant — quite rare (could indicate an event, weather impact, etc.).
z < 0: Fewer hires than usual.
z > 0: More hires than usual.
"""

merged_df_bike = merged_df_bike.assign(
    z_score=lambda frame: (
        (frame["bicycle_hires"] - frame["bicycle_hires_ave"])
        / frame["bicycle_hires_std"]
    )
)

# Z-score interpretations

def z_score_interpret(z_score):
    """Translate a z-score into a short human-readable label.

    Parameters
    ----------
    z_score : float
        Number of standard deviations a day's hires lie from the mean.

    Returns
    -------
    str
        'The day is significantly unusual' when |z| > 2,
        'The day is unusual' when 1 < |z| <= 2,
        'The day is typical' when z == 0,
        'Fewer hires than usual' / 'More hires than usual' for the remaining
        negative / positive values, and 'Nan' when z is NaN.
    """
    magnitude = abs(z_score)

    # The stricter threshold must be tested first: with `> 1` checked before
    # `> 2`, the "significantly unusual" branch could never be reached.
    if magnitude > 2:
        return 'The day is significantly unusual'
    if magnitude > 1:
        return 'The day is unusual'
    if z_score == 0:
        return 'The day is typical'
    if z_score < 0:
        return 'Fewer hires than usual'
    if z_score > 0:
        return 'More hires than usual'
    # Every comparison above is False for NaN, so NaN ends up here.
    return 'Nan'

# Label every row by passing its z-score through z_score_interpret

merged_df_bike['z_score_interpretation'] = merged_df_bike['z_score'].map(z_score_interpret)

In [18]:
# Percentage deviation of each day's hires from its calendar-day average,
# rounded to 2 decimal places
merged_df_bike = merged_df_bike.assign(
    deviation_from_average_pct=lambda frame: (
        frame['bicycle_hires']
        .sub(frame['bicycle_hires_ave'])
        .div(frame['bicycle_hires_ave'])
        .mul(100)
        .round(2)
    )
)

In [19]:
# Spot-check 10 random rows; the fixed random_state makes a re-run of the
# notebook show the same rows instead of a different draw each time.
merged_df_bike.sample(10, random_state=42)

Unnamed: 0,year,month,day,bicycle_hires,bicycle_hires_ave,bicycle_hires_std,z_score,z_score_interpretation,deviation_from_average_pct
671,2016,11,3,31111,28047,6411,0.477929,More hires than usual,10.92
1807,2019,12,14,14862,23221,5167,-1.617767,The day is unusual,-36.0
968,2017,8,27,33657,31965,4582,0.369271,More hires than usual,5.29
539,2016,6,24,36530,36717,3755,-0.0498,Fewer hires than usual,-0.51
185,2015,7,5,29963,40694,6288,-1.706584,The day is unusual,-26.37
745,2017,1,16,19230,21961,4240,-0.644104,Fewer hires than usual,-12.44
1634,2019,6,24,38617,36717,3755,0.505992,More hires than usual,5.17
669,2016,11,1,32334,27754,5406,0.847207,More hires than usual,16.5
1055,2017,11,22,31315,24644,6838,0.975578,More hires than usual,27.07
1430,2018,12,2,15990,22876,5918,-1.163569,The day is unusual,-30.1


In [29]:
# Summarise merged_df_bike: column names, non-null counts and dtypes
merged_df_bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   year                        1826 non-null   int32  
 1   month                       1826 non-null   int32  
 2   day                         1826 non-null   int32  
 3   bicycle_hires               1826 non-null   int64  
 4   bicycle_hires_ave           1826 non-null   int64  
 5   bicycle_hires_std           1826 non-null   int64  
 6   z_score                     1826 non-null   float64
 7   z_score_interpretation      1826 non-null   object 
 8   deviation_from_average_pct  1826 non-null   float64
dtypes: float64(2), int32(3), int64(3), object(1)
memory usage: 107.1+ KB


In [None]:
# Day of the week may also influence hires, so examine it next