In [17]:
# Dependencies

import pandas as pd
from pathlib import Path
import numpy as np

from calendar import monthrange

In [18]:
# Create a For loop to go through each monthly data set, clean and merge all Bike Ride Csvs

# Define an Empty Merged Data frame for Bike share Data

merged_rides = []

# Define ranges for Years of Documents and Months of Documents

years = np.arange(2020,2024,1)
months = np.arange(1,13,1)

# For loop that does through all months in a 3 year space

for y in years:
    
    for m in months:
        
        # Setting File Path for Bike share Data
        
        filepath = Path(f"../data/{y}{'{:02d}'.format(m)}-divvy-tripdata/{y}{'{:02d}'.format(m)}-divvy-tripdata.csv")
        
        # print(filepath)
        
        try:
            
            # read csv
            
            trip_data = pd.read_csv(filepath)
            
            # limit to chicago/ weather applicable data
            
            trip_data['start_lat'] = round(trip_data['start_lat'], 1)
            trip_data['start_lng'] = round(trip_data['start_lng'], 1)
            
            trip_data_chicago = trip_data.loc[(trip_data['start_lat'] == 41.8) 
                                & (trip_data['start_lng'] == -87.6), :]
            
            
            # remove irrelevant data columns
            
            trip_sorted_data = trip_data_chicago[['ride_id', 'started_at', 'ended_at', 'start_station_name', 
                                                'member_casual', 'rideable_type']].copy()
            
            
            # translate started at to Start Date and Start Hour
            
            trip_sorted_data['started_at'] = trip_sorted_data['started_at'].astype('datetime64[s]')
            trip_sorted_data['start_day'] = trip_sorted_data['started_at'].dt.date
            trip_sorted_data['start_hour'] = trip_sorted_data['started_at'].dt.hour
            
            # add new TRIP LENGTH column
            
            trip_sorted_data['started_at'] = pd.to_datetime(trip_sorted_data['started_at'])
            trip_sorted_data['ended_at'] = pd.to_datetime(trip_sorted_data['ended_at'])
            
            trip_sorted_data['trip_length'] = trip_sorted_data['ended_at'] - trip_sorted_data['started_at']
            
            # remove bad data & remove/ reorder Columns
            
            trip_data_clean = trip_sorted_data.loc[trip_sorted_data['trip_length'] > "P0DT0H0M0S", :]
            trip_data_clean = trip_data_clean.dropna(how = 'any')
            
            trip_data_clean = trip_data_clean[['ride_id', 'start_day', 'start_hour', 'trip_length', 
                                                'start_station_name', 'member_casual', 'rideable_type']].copy()
            
            # sort by started_at and fix index
            
            trip_data_clean = trip_data_clean.sort_values(['start_day', 'start_hour']).set_index('ride_id').reset_index()
            
            # display(trip_data_clean.count())
            
            merged_rides.append(trip_data_clean)
            
        # Skip Non-Existing files
            
        except FileNotFoundError:
            
            pass
            
            # check which files are skipped
            
            # print(f"an Exception occured at - {y}{'{:02d}'.format(m)}")



combined_bike_df = pd.concat(merged_rides, ignore_index=True)

combined_bike_df

Unnamed: 0,ride_id,start_day,start_hour,trip_length,start_station_name,member_casual,rideable_type
0,71AFDB68CD4B2F10,2020-04-01,3,0 days 00:08:45,Emerald Ave & 28th St,member,docked_bike
1,045670E8C86D60F8,2020-04-01,5,0 days 00:07:21,Kimbark Ave & 53rd St,member,docked_bike
2,E011081FDFFEC179,2020-04-01,6,0 days 00:36:41,Halsted St & 37th St,casual,docked_bike
3,8AC0C3A4172436F8,2020-04-01,6,0 days 00:04:31,Harper Ave & 59th St,member,docked_bike
4,90B0B0B44D118FCB,2020-04-01,6,0 days 00:04:05,State St & 29th St,member,docked_bike
...,...,...,...,...,...,...,...
1044713,8BE659437F5FEE5D,2023-11-30,23,0 days 00:06:29,Ellis Ave & 55th St,casual,electric_bike
1044714,B20C398992F08514,2023-11-30,23,0 days 00:04:52,Kimbark Ave & 53rd St,casual,electric_bike
1044715,C03B2097E3CD74E2,2023-11-30,23,0 days 00:04:49,MLK Jr Dr & 56th St,member,classic_bike
1044716,A904476E89E76A93,2023-11-30,23,0 days 00:06:46,MLK Jr Dr & 29th St,member,electric_bike


In [19]:
# Start Weather Data Data Cleaning

# Setting File Path for Csv

weather_filepath = Path('../data/chicago_weather_data_2020_2023_Celcius.csv')

# Read and display Csv data

weather_data = pd.read_csv(weather_filepath)

# translate started at 

weather_data['dt_iso'] = pd.to_datetime(weather_data['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC', utc=True)
weather_data['start_day'] = weather_data['dt_iso'].dt.date
weather_data['start_hour'] = weather_data['dt_iso'].dt.hour

# remove irrelevant data columns and duplicated data

weather_data_sorted = weather_data[['start_day', 'start_hour', 'temp', 'dew_point', 'feels_like', 
                                'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
                                'clouds_all', 'weather_id', 'weather_main', 'weather_description']].copy()

weather_data_sorted = weather_data_sorted.drop_duplicates(subset=['start_day', 'start_hour'])

weather_data_sorted



Unnamed: 0,start_day,start_hour,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,weather_description
0,2020-01-01,0,-2.08,-5.63,-9.08,-2.31,-1.56,1010,74,12.90,260,95,804,Clouds,overcast clouds
1,2020-01-01,1,-1.99,-5.23,-8.99,-2.23,-1.54,1010,76,10.30,270,75,803,Clouds,broken clouds
2,2020-01-01,2,-1.87,-4.96,-8.87,-2.23,-1.36,1011,77,9.80,270,100,804,Clouds,overcast clouds
3,2020-01-01,3,-1.90,-4.55,-8.90,-2.79,-1.14,1011,80,9.30,270,40,802,Clouds,scattered clouds
4,2020-01-01,4,-2.18,-4.67,-9.18,-2.78,-1.36,1011,81,8.80,260,75,803,Clouds,broken clouds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37232,2023-12-31,19,0.72,-1.19,-5.23,0.00,2.22,1014,86,7.60,290,100,600,Snow,light snow
37234,2023-12-31,20,0.63,-1.70,-5.17,0.00,2.02,1014,83,7.15,291,100,600,Snow,light snow
37236,2023-12-31,21,0.88,-1.19,-4.85,0.53,2.02,1015,85,7.15,289,100,600,Snow,light snow
37238,2023-12-31,22,0.77,-1.15,-4.38,-0.03,2.22,1016,86,5.81,287,100,600,Snow,light snow


In [20]:
# Merge the Datasets

result = pd.merge(combined_bike_df, weather_data_sorted, on=['start_day', 'start_hour'], how='inner')

# return1 = result.loc[result['start_hour'] == 1, :]

result

Unnamed: 0,ride_id,start_day,start_hour,trip_length,start_station_name,member_casual,rideable_type,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,weather_description
0,71AFDB68CD4B2F10,2020-04-01,3,0 days 00:08:45,Emerald Ave & 28th St,member,docked_bike,3.24,1.13,-2.18,2.77,4.02,1018,86,8.20,360,100,804,Clouds,overcast clouds
1,045670E8C86D60F8,2020-04-01,5,0 days 00:07:21,Kimbark Ave & 53rd St,member,docked_bike,3.27,1.16,-1.59,2.77,4.02,1018,86,6.70,350,100,804,Clouds,overcast clouds
2,E011081FDFFEC179,2020-04-01,6,0 days 00:36:41,Halsted St & 37th St,casual,docked_bike,3.27,0.99,-1.97,2.77,4.02,1018,85,7.70,340,100,804,Clouds,overcast clouds
3,8AC0C3A4172436F8,2020-04-01,6,0 days 00:04:31,Harper Ave & 59th St,member,docked_bike,3.27,0.99,-1.97,2.77,4.02,1018,85,7.70,340,100,804,Clouds,overcast clouds
4,90B0B0B44D118FCB,2020-04-01,6,0 days 00:04:05,State St & 29th St,member,docked_bike,3.27,0.99,-1.97,2.77,4.02,1018,85,7.70,340,100,804,Clouds,overcast clouds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044713,8BE659437F5FEE5D,2023-11-30,23,0 days 00:06:29,Ellis Ave & 55th St,casual,electric_bike,8.41,5.52,4.99,7.75,8.83,1010,82,6.71,191,94,804,Clouds,overcast clouds
1044714,B20C398992F08514,2023-11-30,23,0 days 00:04:52,Kimbark Ave & 53rd St,casual,electric_bike,8.41,5.52,4.99,7.75,8.83,1010,82,6.71,191,94,804,Clouds,overcast clouds
1044715,C03B2097E3CD74E2,2023-11-30,23,0 days 00:04:49,MLK Jr Dr & 56th St,member,classic_bike,8.41,5.52,4.99,7.75,8.83,1010,82,6.71,191,94,804,Clouds,overcast clouds
1044716,A904476E89E76A93,2023-11-30,23,0 days 00:06:46,MLK Jr Dr & 29th St,member,electric_bike,8.41,5.52,4.99,7.75,8.83,1010,82,6.71,191,94,804,Clouds,overcast clouds


In [21]:
# save file

export_merged_filepath = Path('../output/merged_weather_bike_data.csv')

result.to_csv(export_merged_filepath, index = False)