# Data Cleaning

In [1]:
# Dependencies

import pandas as pd
from pathlib import Path
import numpy as np

from calendar import monthrange

## Chicago BikeShare CSV Data Cleaning and Merging

In [4]:
# Load every monthly Divvy trip CSV, clean it, and merge all months into one DataFrame

# Only rides whose start coordinates round (to 1 decimal) to this point are kept
# (the "chicago / weather applicable" subset of the data)
TARGET_START_LAT = 41.7
TARGET_START_LNG = -87.7

# Columns read from the raw monthly files (besides start_lat / start_lng used for filtering)
RAW_TRIP_COLUMNS = ['ride_id', 'started_at', 'ended_at', 'start_station_name',
                    'member_casual', 'rideable_type']

# Column order of every cleaned monthly frame, and therefore of combined_bike_df
CLEAN_TRIP_COLUMNS = ['ride_id', 'started_at', 'start_day', 'start_hour', 'start_year', 'trip_length',
                      'start_station_name', 'member_casual', 'rideable_type']


def clean_trip_data(trip_data):
    """Clean one month of raw bike trip data.

    Parameters
    ----------
    trip_data : pandas.DataFrame
        Raw monthly frame. Must contain the RAW_TRIP_COLUMNS plus
        'start_lat' and 'start_lng'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Rides that start at the target coordinates, have a trip length
        greater than zero and no missing values. Columns are ordered as
        CLEAN_TRIP_COLUMNS, rows are sorted by 'started_at' and the index
        is a fresh 0..n-1 range.

    Raises
    ------
    KeyError
        If one of the required columns is missing from trip_data.
    """
    # limit to chicago / weather applicable data (compare on rounded copies,
    # the raw coordinates are left untouched)
    at_target = (
        (trip_data['start_lat'].round(1) == TARGET_START_LAT)
        & (trip_data['start_lng'].round(1) == TARGET_START_LNG)
    )
    trips = trip_data.loc[at_target, RAW_TRIP_COLUMNS].copy()

    # parse both timestamps once
    trips['started_at'] = pd.to_datetime(trips['started_at'])
    trips['ended_at'] = pd.to_datetime(trips['ended_at'])

    # translate started_at to start day / hour / year
    trips['start_day'] = trips['started_at'].dt.date
    trips['start_hour'] = trips['started_at'].dt.hour
    trips['start_year'] = trips['started_at'].dt.year

    # add new TRIP LENGTH column
    trips['trip_length'] = trips['ended_at'] - trips['started_at']

    # remove bad data (zero / negative durations, rows with missing values)
    # and keep only the output columns in their final order
    has_positive_length = trips['trip_length'] > pd.Timedelta(0)
    cleaned = trips.loc[has_positive_length].dropna(how='any')[CLEAN_TRIP_COLUMNS]

    # sort by started_at (stable, so the result is deterministic) and fix index
    return cleaned.sort_values('started_at', kind='stable').reset_index(drop=True)


# Cleaned monthly frames, in chronological file order
merged_rides = []

# 'YYYYMM' labels of the months for which no CSV exists on disk
missing_months = []

# Define ranges for Years of Documents and Months of Documents (2020 through 2023)
years = np.arange(2020, 2024, 1)
months = np.arange(1, 13, 1)

# Go through every month of the four years
for y in years:
    for m in months:

        # Setting File Path for Bike share Data
        month_label = f"{y}{m:02d}"
        filepath = Path(f"../data/{month_label}-divvy-tripdata/{month_label}-divvy-tripdata.csv")

        # Skip non-existing files, but remember which ones were skipped.
        # Only the read is guarded so errors in the cleaning step are not hidden.
        try:
            trip_data = pd.read_csv(filepath)
        except FileNotFoundError:
            missing_months.append(month_label)
            continue

        merged_rides.append(clean_trip_data(trip_data))

if merged_rides:
    combined_bike_df = pd.concat(merged_rides, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list, so fall back to an empty frame
    print("No Divvy trip files were found under ../data - combined_bike_df is empty.")
    combined_bike_df = pd.DataFrame(columns=CLEAN_TRIP_COLUMNS)

combined_bike_df

Unnamed: 0,ride_id,started_at,start_day,start_hour,start_year,trip_length,start_station_name,member_casual,rideable_type
0,64F68C479159B03B,2020-08-13 09:11:37,2020-08-13,9,2020,0 days 00:02:55,Loomis Blvd & 84th St,member,electric_bike
1,A8D69A9C1432F17D,2020-08-13 17:27:09,2020-08-13,17,2020,0 days 02:28:19,Loomis Blvd & 84th St,casual,electric_bike
2,11248E80B456ECCE,2020-08-13 18:47:09,2020-08-13,18,2020,0 days 01:08:05,Loomis Blvd & 84th St,casual,electric_bike
3,00717D3AD70E1587,2020-08-13 18:49:18,2020-08-13,18,2020,0 days 01:06:20,Loomis Blvd & 84th St,casual,electric_bike
4,D1F0BCECDE17C076,2020-08-14 08:16:40,2020-08-14,8,2020,0 days 00:40:55,Loomis Blvd & 84th St,member,electric_bike
...,...,...,...,...,...,...,...,...,...
11286,862A359A2124B650,2023-11-30 11:34:20,2023-11-30,11,2023,0 days 00:06:07,Elizabeth St & 92nd St,member,classic_bike
11287,92F608CF92A47F3C,2023-11-30 16:23:37,2023-11-30,16,2023,0 days 00:14:06,Vincennes Ave & 104th St,member,classic_bike
11288,C9156CB0FDD10CA7,2023-11-30 17:38:27,2023-11-30,17,2023,0 days 00:21:15,Public Rack - Hamilton Ave & 95th St,member,electric_bike
11289,BC400B236C36B758,2023-11-30 18:43:23,2023-11-30,18,2023,0 days 00:25:39,Kedzie Ave & 83rd St,casual,classic_bike


## Chicago Weather CSV Data Cleaning and Merging

In [None]:

# Weather data cleaning: load the hourly observations and key them by local day + hour

# Location of the weather CSV
weather_filepath = Path('../data/chicago_weather_data_2020_2023_Celcius.csv')

# Load the raw observations
weather_data = pd.read_csv(weather_filepath)

# Parse the dt_iso text column into timezone-aware UTC timestamps
weather_data['dt_iso'] = pd.to_datetime(
    weather_data['dt_iso'],
    format='%Y-%m-%d %H:%M:%S %z UTC',
    utc=True,
)

# Express the same instants in Chicago local time
weather_data['dt_iso_chicago'] = weather_data['dt_iso'].dt.tz_convert('America/Chicago')

# Split the local timestamp into the start_day / start_hour key columns
local_timestamps = weather_data['dt_iso_chicago']
weather_data['start_day'] = local_timestamps.dt.date
weather_data['start_hour'] = local_timestamps.dt.hour

# Keep the key columns plus the weather measurements of interest
weather_columns = [
    'start_day', 'start_hour',
    'temp', 'dew_point', 'feels_like', 'temp_min', 'temp_max',
    'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all',
    'weather_id', 'weather_main', 'weather_description',
]

# One row per (day, hour): when a key occurs more than once the first row is kept
weather_data_sorted = (
    weather_data
    .loc[:, weather_columns]
    .drop_duplicates(subset=['start_day', 'start_hour'])
)

weather_data_sorted



## Merge the Data Frames

In [None]:
# Attach the weather readings to every ride.
# Inner join on day + hour: rides without a matching weather row are dropped.

join_keys = ['start_day', 'start_hour']
result = combined_bike_df.merge(weather_data_sorted, how='inner', on=join_keys)

result

## Export Merged CSV

In [None]:
# Save the merged bike / weather data as CSV (the DataFrame index is not written)

export_merged_filepath = Path('../output/merged_weather_bike_data.csv')

# to_csv does not create missing folders, so make sure the output folder exists first
export_merged_filepath.parent.mkdir(parents=True, exist_ok=True)

result.to_csv(export_merged_filepath, index = False)