#### This dataset contains the hourly and daily count of rental bikes between the years 2011 and 2012 in the Capital bike share system with the corresponding weather and seasonal information.

  Data Dictionary

  1. datetime - hourly date + timestamp
  2. season - 1 = spring | 2 = summer | 3 = fall | 4 = winter
  3. holiday - whether the day is considered a holiday
  4. workingday - whether the day is neither a weekend nor holiday
  5. weather -
         1: Clear, Few clouds, Partly cloudy

         2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist

         3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds

         4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
  6. temp - temperature in Celsius
  7. atemp - "feels like" temperature in Celsius
  8. humidity - relative humidity
  9. windspeed - wind speed
  10. casual - number of non-registered user rentals initiated
  11. registered - number of registered user rentals initiated
  12. count - number of total rentals (Dependent Variable)


  `For further information visit the link:`
   1.  https://www.kaggle.com/competitions/bike-sharing-demand/data
   2.  https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset

##### Data Preparation

In [1]:
#importing the required libraries
import pandas as pd
import numpy as np
import io
import requests
import zipfile
from datetime import datetime, time
import calendar

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_squared_log_error
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.ensemble import RandomForestRegressor


import joblib
from IPython.display import display
import ipywidgets as widgets
import os
import shutil

In [2]:
# Upload the Kaggle API key (kaggle.json) generated from your Kaggle account.
# The picker is limited to a single .json file so that an unrelated selection
# is not mistaken for the credentials file by the download step.
uploader = widgets.FileUpload(accept='.json', multiple=False)
display(uploader)

FileUpload(value={}, description='Upload')

In [20]:
def getDataFilesFromKaggle():
  # Get the uploaded file contents
  uploaded_file = list(uploader.value.values())[0]
  content = uploaded_file['content']

  # Write the file to disk
  with open('kaggle.json', 'wb') as f:
      f.write(content)

  # Create the ~/.kaggle directory if it doesn't exist
  os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)

  # Move kaggle.json to ~/.kaggle/
  shutil.move('kaggle.json', os.path.expanduser('~/.kaggle/kaggle.json'))

  # Set file permissions to be secure
  os.chmod(os.path.expanduser('~/.kaggle/kaggle.json'), 0o600)

  !pip install -q kaggle
  !kaggle competitions download -c bike-sharing-demand

   # print the contents of the zip file with directory structure
  with zipfile.ZipFile("bike-sharing-demand.zip", 'r') as zip_ref:
    zip_ref.printdir()

  # Extract the contents of the zip file into the data folder
  os.makedirs("./data/input_bike_data", exist_ok=True) # Create the data folder if it doesn't exist
  with zipfile.ZipFile("bike-sharing-demand.zip", 'r') as zip_ref:
    zip_ref.extractall("./data/input_bike_data")

  # Delete the zip file
  os.remove("bike-sharing-demand.zip")

  print("Data files downloaded and extracted successfully!")

In [21]:
# Fetch the competition files, then read the train and test splits into dataframes
getDataFilesFromKaggle()
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/input_bike_data/train.csv',
                     './data/input_bike_data/test.csv')
)

Downloading bike-sharing-demand.zip to /content
  0% 0.00/189k [00:00<?, ?B/s]
100% 189k/189k [00:00<00:00, 366MB/s]
File Name                                             Modified             Size
sampleSubmission.csv                           2019-12-11 03:40:36       142861
test.csv                                       2019-12-11 03:40:36       323856
train.csv                                      2019-12-11 03:40:36       648353
Data files downloaded and extracted successfully!


In [22]:
# checking the data: first five rows of the training split
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [25]:
# first five rows of the test split
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [24]:
# column dtypes and non-null counts of the training split
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [26]:
# column dtypes and non-null counts of the test split
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    6493 non-null   object 
 1   season      6493 non-null   int64  
 2   holiday     6493 non-null   int64  
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   int64  
 5   temp        6493 non-null   float64
 6   atemp       6493 non-null   float64
 7   humidity    6493 non-null   int64  
 8   windspeed   6493 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 456.7+ KB


#### We see that the test set lacks the target feature and the accompanying columns `casual` and `registered`. We also observe that the sum of these two columns is the target feature, so both are data-leakage channels. So we drop them.

In [27]:
# `casual` + `registered` add up to `count`, so both leak the target, and the
# test split does not contain them. drop(..., errors='ignore') keeps this cell
# safe to re-run, whereas a second `del` would raise KeyError.
train = train.drop(columns=['casual', 'registered'], errors='ignore')

### Now we merge both datasets into one and sort the complete dataset by datetime

In [28]:
# Stack the train rows above the test rows under a fresh 0..n-1 index
full_kaggle_df = pd.concat(objs=[train, test], axis=0, ignore_index=True)

In [29]:
# Parse the 'datetime' column into real timestamps (no-op if already parsed)
full_kaggle_df = full_kaggle_df.assign(
    datetime=pd.to_datetime(full_kaggle_df['datetime'])
)

In [30]:
# Order the rows chronologically and renumber the index from zero
full_kaggle_df = full_kaggle_df.sort_values('datetime', ignore_index=True)

#### Since the test set didn't have count values, the corresponding rows in `full_kaggle_df` have NaN as their count values. So we bring in the UCI dataset to extract these count values from there.

In [31]:
def getDataFileFromUCI():
  """Download the UCI bike-sharing archive and return ``hour.csv`` as a DataFrame.

  Returns:
    pandas.DataFrame read from the ``hour.csv`` member of the archive.

  Raises:
    requests.exceptions.RequestException: if the download fails, times out or
      the server answers with an HTTP error status.
  """
  # Download the zipped dataset from the UCI repository. The timeout stops the
  # cell from hanging indefinitely; raise_for_status surfaces HTTP errors here
  # instead of as a confusing BadZipFile when an error page is unzipped.
  response = requests.get(
      "https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip",
      timeout=60,
  )
  response.raise_for_status()

  # Unzip in memory and read hour.csv straight into a dataframe
  with zipfile.ZipFile(io.BytesIO(response.content)) as unzp:
    with unzp.open('hour.csv') as hour_csv:
      ori_df = pd.read_csv(hour_csv)

  return ori_df

In [32]:
# Load the UCI hourly data and preview its first five rows
uci_df = getDataFileFromUCI()
uci_df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [33]:
# column dtypes and non-null counts of the UCI hourly data
uci_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [34]:
# Parse the 'dteday' date strings into timestamps
uci_df = uci_df.assign(dteday=pd.to_datetime(uci_df['dteday']))

In [35]:
# Creating a new 'datetime' column: calendar date plus the hour-of-day offset.
# Vectorised replacement for a row-wise apply; normalize() drops any time part
# of 'dteday' so that only 'hr' determines the hour.
uci_df['datetime'] = (
    pd.to_datetime(uci_df['dteday']).dt.normalize()
    + pd.to_timedelta(uci_df['hr'], unit='h')
)

In [36]:
# Order the UCI rows chronologically and renumber the index from zero
uci_df = uci_df.sort_values('datetime', ignore_index=True)

In [37]:
def sanity_check_counts_match(full_kaggle_df, uci_df):
    """Check that every known Kaggle ``count`` equals the UCI ``cnt`` for the same datetime.

    Neither input frame is modified. Kaggle rows whose ``count`` is NaN are
    skipped. Rows are joined with a left merge, so a Kaggle datetime that has
    no UCI counterpart is reported as a mismatch instead of being silently
    dropped from the comparison.

    Parameters:
        full_kaggle_df: DataFrame with 'datetime' and 'count' columns.
        uci_df: DataFrame with 'datetime' and 'cnt' columns.

    Returns:
        None when all compared rows agree, otherwise a DataFrame with columns
        'datetime', 'count' and 'cnt' holding the disagreeing rows.
    """
    # Keep only rows with a known count; work on a copy so the caller's frame is untouched
    known_counts = full_kaggle_df.loc[
        full_kaggle_df['count'].notna(), ['datetime', 'count']
    ].copy()
    known_counts['datetime'] = pd.to_datetime(known_counts['datetime'])

    # Same for the UCI side: convert on a copy rather than writing back into uci_df
    uci_counts = uci_df[['datetime', 'cnt']].copy()
    uci_counts['datetime'] = pd.to_datetime(uci_counts['datetime'])

    # Left merge: unmatched Kaggle rows get cnt = NaN and therefore compare unequal
    merged = known_counts.merge(uci_counts, on='datetime', how='left')

    # Compare the values
    comparison = merged['count'] == merged['cnt']

    if comparison.all():
        print("✅ All matching datetimes have matching count values!")
        return None  # everything matched

    mismatches = merged[~comparison]
    print(f"❌ {len(mismatches)} mismatches found out of {len(merged)} checked rows.")
    return mismatches  # return the mismatching rows

In [38]:
def traditional_sanity_check_counts_match(full_kaggle_df, uci_df):
    """Loop-based check that each known Kaggle ``count`` equals the UCI ``cnt``.

    Neither input frame is modified. Kaggle rows whose ``count`` is NaN are
    skipped; a Kaggle datetime that is absent from the UCI data is reported as
    a mismatch with ``None`` as the UCI value.

    Parameters:
        full_kaggle_df: DataFrame with 'datetime' and 'count' columns.
        uci_df: DataFrame with 'datetime' and 'cnt' columns.

    Returns:
        None when every checked row agrees, otherwise a list of
        (datetime, kaggle_count, uci_count) tuples.
    """
    # Convert datetimes into local Series instead of writing back into the inputs
    kaggle_datetimes = pd.to_datetime(full_kaggle_df['datetime'])
    uci_datetimes = pd.to_datetime(uci_df['datetime'])

    # Only rows with a known 'count' (i.e., from the train set) are checked
    has_count = full_kaggle_df['count'].notna()

    # Dictionary from UCI data for fast lookup
    uci_lookup = dict(zip(uci_datetimes, uci_df['cnt']))

    mismatches = []
    total_checked = 0

    # Iterate over the two needed columns only; iterrows would build a full
    # object-dtype Series for every row.
    for dt, kaggle_count in zip(kaggle_datetimes[has_count],
                                full_kaggle_df.loc[has_count, 'count']):
        uci_count = uci_lookup.get(dt)  # None when the datetime is missing from UCI

        if uci_count != kaggle_count:
            mismatches.append((dt, kaggle_count, uci_count))
        total_checked += 1

    if not mismatches:
        print(f"✅ All {total_checked} matching datetimes have matching count values!")
        return None

    print(f"❌ Found {len(mismatches)} mismatches out of {total_checked} rows checked.")
    return mismatches  # List of (datetime, kaggle_count, uci_count)

In [39]:
# Row-by-row check of the known Kaggle counts against the UCI cnt values
check_mismatches = traditional_sanity_check_counts_match(full_kaggle_df, uci_df)

✅ All 10886 matching datetimes have matching count values!


In [40]:
# double sanity check: repeat the comparison with the merge-based implementation
mismatched_rows = sanity_check_counts_match(full_kaggle_df, uci_df)

✅ All matching datetimes have matching count values!


In [41]:
# Building a datetime → cnt dictionary from uci_df
cnt_map = dict(zip(uci_df['datetime'], uci_df['cnt']))

# Look up the UCI count for every row before touching the frame
mapped_counts = full_kaggle_df['datetime'].map(cnt_map)

# Fail loudly if any row has no UCI counterpart: otherwise the column would
# silently turn into floats with NaN targets and be saved like that.
n_unmapped = int(mapped_counts.isna().sum())
if n_unmapped:
    raise ValueError(
        f"{n_unmapped} rows of full_kaggle_df have no matching datetime in uci_df"
    )

# Replace the partially-NaN count column with the complete UCI counts
# (re-added as the last column, stored as integers)
full_kaggle_df = (
    full_kaggle_df
    .drop(columns=['count'])
    .assign(count=mapped_counts.astype('int64'))
)

In [42]:
# confirm that every column, including the remapped count, is fully populated
full_kaggle_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    17379 non-null  datetime64[ns]
 1   season      17379 non-null  int64         
 2   holiday     17379 non-null  int64         
 3   workingday  17379 non-null  int64         
 4   weather     17379 non-null  int64         
 5   temp        17379 non-null  float64       
 6   atemp       17379 non-null  float64       
 7   humidity    17379 non-null  int64         
 8   windspeed   17379 non-null  float64       
 9   count       17379 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(6)
memory usage: 1.3 MB


In [43]:
# earliest ten rows of the merged, chronologically sorted data
full_kaggle_df.head(10)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1
5,2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.0032,1
6,2011-01-01 06:00:00,1,0,0,1,9.02,13.635,80,0.0,2
7,2011-01-01 07:00:00,1,0,0,1,8.2,12.88,86,0.0,3
8,2011-01-01 08:00:00,1,0,0,1,9.84,14.395,75,0.0,8
9,2011-01-01 09:00:00,1,0,0,1,13.12,17.425,76,0.0,14


In [44]:
# latest ten rows of the merged, chronologically sorted data
full_kaggle_df.tail(10)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
17369,2012-12-31 14:00:00,1,0,1,2,11.48,13.635,45,15.0013,247
17370,2012-12-31 15:00:00,1,0,1,2,11.48,14.395,45,8.9981,315
17371,2012-12-31 16:00:00,1,0,1,2,10.66,12.88,48,12.998,214
17372,2012-12-31 17:00:00,1,0,1,2,10.66,14.395,48,6.0032,164
17373,2012-12-31 18:00:00,1,0,1,2,10.66,13.635,48,8.9981,122
17374,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.0014,119
17375,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.0014,89
17376,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.0014,90
17377,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,61
17378,2012-12-31 23:00:00,1,0,1,1,10.66,13.635,65,8.9981,49


### Now we shall save this data as various datasets required at points in the pipeline

In [45]:
# Index a fresh frame by timestamp so that date strings can slice it
# (set_index returns a new frame, leaving full_kaggle_df as it was)
file_save_df = full_kaggle_df.set_index('datetime')

# chronological splits by absolute date; label slices include both end dates
test_and_evidently_reference_df = file_save_df.loc['2012-05-01':'2012-08-31']  # months 17–20 (May–Aug 2012)
evidently_current_df = file_save_df.loc['2012-09-01':'2012-12-31']  # months 21–24 (Sep–Dec 2012)

In [47]:
# Sanity-check: report the (rows, columns) shape of each subset
for label, frame in (
    ("Test:", test_and_evidently_reference_df),
    ("Drift Monitoring Test Set:", evidently_current_df),
    ("Entire Set:", file_save_df),
):
    print(label, frame.shape)

Test: (2952, 9)
Drift Monitoring Test Set: (2888, 9)
Entire Set: (17379, 9)


In [48]:
# Save the files to the disk. ./data is created explicitly so this cell does
# not depend on the Kaggle download cell having created it as a side effect.
os.makedirs('./data', exist_ok=True)
test_and_evidently_reference_df.to_csv('./data/test-and-reference_data.csv', index=True)
evidently_current_df.to_csv('./data/evidently_current_data.csv', index=True)
file_save_df.to_csv('./data/entire_data.csv', index=True)