In [52]:
# Dependencies

import pandas as pd
from pathlib import Path
import numpy as np

from calendar import monthrange

In [None]:
# Read every monthly Divvy trip-data CSV, clean it, and merge the results into one DataFrame.
#
# Result: `combined_bike_df`, one row per kept ride, with the columns listed in CLEAN_COLUMNS.
# Months whose CSV file does not exist are skipped and recorded in `skipped_months`.

# Columns read from each raw monthly file
RAW_COLUMNS = ['ride_id', 'started_at', 'ended_at', 'start_lat',
               'start_lng', 'member_casual', 'rideable_type']

# Columns (and their order) of each cleaned monthly frame and of the merged result
CLEAN_COLUMNS = ['ride_id', 'start_day', 'start_hour', 'trip_length', 'start_lat',
                 'start_lng', 'member_casual', 'rideable_type']

# Start coordinates (rounded to 1 decimal place) a ride must have to be kept;
# this is the "chicago / weather applicable" area the analysis is limited to
AREA_LAT = 41.8
AREA_LNG = -87.6


def clean_trip_data(trip_data):
    """Clean one month of raw trip data.

    Parameters
    ----------
    trip_data : pandas.DataFrame
        Raw monthly data; must contain every column in RAW_COLUMNS.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        Rides whose start coordinates round (1 decimal) to AREA_LAT / AREA_LNG
        and whose trip length is strictly positive, with the columns in
        CLEAN_COLUMNS, sorted by start day and start hour, on a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a column listed in RAW_COLUMNS is missing from `trip_data`.
    """
    # Work on a copy of the relevant columns only, so the caller's frame stays untouched
    rides = trip_data[RAW_COLUMNS].copy()

    # Limit to the analysis area: round the start coordinates, then match them exactly
    rides['start_lat'] = rides['start_lat'].round(1)
    rides['start_lng'] = rides['start_lng'].round(1)
    in_area = (rides['start_lat'] == AREA_LAT) & (rides['start_lng'] == AREA_LNG)
    rides = rides.loc[in_area].copy()

    # Parse both timestamps once, and in the same way, before deriving anything from them
    rides['started_at'] = pd.to_datetime(rides['started_at'])
    rides['ended_at'] = pd.to_datetime(rides['ended_at'])

    # Split the start timestamp into a calendar date and an hour of day
    rides['start_day'] = rides['started_at'].dt.date
    rides['start_hour'] = rides['started_at'].dt.hour

    # Trip length as a timedelta
    rides['trip_length'] = rides['ended_at'] - rides['started_at']

    # Remove bad data: zero or negative durations (rows with a missing
    # timestamp compare False and are dropped as well)
    rides = rides.loc[rides['trip_length'] > pd.Timedelta(0), CLEAN_COLUMNS]

    # Sort chronologically and replace the index with a clean 0..n-1 range
    return rides.sort_values(['start_day', 'start_hour']).reset_index(drop=True)


# One cleaned DataFrame per monthly file that was found
merged_rides = []

# Months (as 'YYYYMM') for which no file was found; inspect this to see what was skipped
skipped_months = []

# Years and months to look for; the end of np.arange is exclusive,
# so this covers 2020 through 2023 and months 1 through 12
years = np.arange(2020,2024,1)
months = np.arange(1,13,1)

for y in years:

    for m in months:

        # Each monthly CSV sits in a folder that has the same name as the file
        file_stem = f"{y}{m:02d}-divvy-tripdata"
        filepath = Path(f"../data/{file_stem}/{file_stem}.csv")

        # Only the read is guarded, so a missing file cannot hide an error in the cleaning step
        try:
            trip_data = pd.read_csv(filepath)
        except FileNotFoundError:
            skipped_months.append(f"{y}{m:02d}")
            continue

        trip_data_clean = clean_trip_data(trip_data)

        merged_rides.append(trip_data_clean)

# pd.concat raises "No objects to concatenate" on an empty list, so handle
# the case where no monthly file was found explicitly
if merged_rides:
    combined_bike_df = pd.concat(merged_rides, ignore_index=True)
else:
    print("No trip-data files were found under ../data - combined_bike_df is empty.")
    combined_bike_df = pd.DataFrame(columns=CLEAN_COLUMNS)

combined_bike_df

Unnamed: 0,ride_id,start_day,start_hour,trip_length,start_lat,start_lng,member_casual,rideable_type
0,71AFDB68CD4B2F10,2020-04-01,3,0 days 00:08:45,41.8,-87.6,member,docked_bike
1,045670E8C86D60F8,2020-04-01,5,0 days 00:07:21,41.8,-87.6,member,docked_bike
2,E011081FDFFEC179,2020-04-01,6,0 days 00:36:41,41.8,-87.6,casual,docked_bike
3,8AC0C3A4172436F8,2020-04-01,6,0 days 00:04:31,41.8,-87.6,member,docked_bike
4,90B0B0B44D118FCB,2020-04-01,6,0 days 00:04:05,41.8,-87.6,member,docked_bike
...,...,...,...,...,...,...,...,...
1372108,EF158BA81DAF2CDD,2023-11-30,23,0 days 00:04:42,41.8,-87.6,member,electric_bike
1372109,B20C398992F08514,2023-11-30,23,0 days 00:04:52,41.8,-87.6,casual,electric_bike
1372110,C03B2097E3CD74E2,2023-11-30,23,0 days 00:04:49,41.8,-87.6,member,classic_bike
1372111,A904476E89E76A93,2023-11-30,23,0 days 00:06:46,41.8,-87.6,member,electric_bike


In [None]:
# Save the merged, cleaned ride data to CSV

# Destination file for the merged data
export_merged_filepath = Path('../cleaned_data/202004-cleaned-tripdata.csv')

# to_csv does not create missing folders, so make sure the output directory exists first
export_merged_filepath.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the file
combined_bike_df.to_csv(export_merged_filepath, index = False)