# Data Cleaning

In [1]:
# Dependencies

import pandas as pd
from pathlib import Path
import numpy as np

from calendar import monthrange

## Chicago BikeShare CSV Data Cleaning and Merging

In [None]:
# Create a For loop to go through each monthly data set, clean and merge all Bike Ride Csvs

# Define an Empty Merged Data frame for Bike share Data

merged_rides = []

# Define ranges for Years of Documents and Months of Documents

years = np.arange(2020,2024,1)
months = np.arange(1,13,1)

# Function to process datetime.time objects and compute rounded hours (Lovecy)

def convert_and_round(time_obj):
    
    # Extract hours, minutes, and seconds
    
    hours = time_obj.hour
    minutes = time_obj.minute
    seconds = time_obj.second

    # Calculate total hours
    
    total_hours = hours + (minutes / 60) + (seconds / 3600)
    
    # Apply rounding logic
    
    rounded_hour = np.ceil(total_hours) if total_hours > 0.5 else hours

    # Apply rounding logic
    
    return int(rounded_hour)

# For loop that does through all months in a 3 year space

for y in years:
    
    for m in months:
        
        # Setting File Path for Bike share Data
        
        filepath = Path(f"../data/{y}{'{:02d}'.format(m)}-divvy-tripdata/{y}{'{:02d}'.format(m)}-divvy-tripdata.csv")
        
        # print(filepath)
        
        try:
            
            # read csv
            
            trip_data = pd.read_csv(filepath)
            
            # limit to chicago/ weather applicable data
            
            trip_data['start_lat'] = round(trip_data['start_lat'], 1)
            trip_data['start_lng'] = round(trip_data['start_lng'], 1)
            
            trip_data_chicago = trip_data.loc[(trip_data['start_lat'] == 41.9) 
                                & (trip_data['start_lng'] == -87.6), :]
            
            
            # remove irrelevant data columns
            
            trip_sorted_data = trip_data_chicago[['ride_id', 'started_at', 'ended_at', 'start_station_name', 
                                                'member_casual', 'rideable_type']].copy()
            
            
            # translate started at to Start Date and Start Hour
            
            trip_sorted_data['started_at'] = trip_sorted_data['started_at'].astype('datetime64[s]')
            trip_sorted_data['start_day'] = trip_sorted_data['started_at'].dt.date
            trip_sorted_data['start_hour1'] = trip_sorted_data['started_at'].dt.hour 
            trip_sorted_data['start_time'] = trip_sorted_data['started_at'].dt.time 
            trip_sorted_data['start_year'] = trip_sorted_data['started_at'].dt.year
            
            # Apply the time rounding  function to the 'start_time' column (Lovecy)
            
            trip_sorted_data['start_hour'] = trip_sorted_data['start_time'].apply(convert_and_round)
            
            # Account for the irregular 24 in the start hour
            
            if trip_sorted_data['start_hour'] == 24.0:
                
                trip_sorted_data['start_hour'] = 0
                trip_sorted_data['start_day'] += pd.Timedelta(days=1)
            
            # add new TRIP LENGTH column
            
            trip_sorted_data['started_at'] = pd.to_datetime(trip_sorted_data['started_at'])
            trip_sorted_data['ended_at'] = pd.to_datetime(trip_sorted_data['ended_at'])
            
            trip_sorted_data['trip_length'] = trip_sorted_data['ended_at'] - trip_sorted_data['started_at']
            
            # remove bad data & remove/ reorder Columns
            
            trip_data_clean = trip_sorted_data.loc[trip_sorted_data['trip_length'] > "P0DT0H0M0S", :]
            trip_data_clean = trip_data_clean.dropna(how = 'any')
            
            trip_data_clean = trip_data_clean[['ride_id', 'started_at', 'start_day', 'start_hour', 'start_hour1', 'start_year', 'trip_length', 
                                                'start_station_name', 'member_casual', 'rideable_type']].copy()
            
            # sort by started_at and fix index
            
            trip_data_clean = trip_data_clean.sort_values(['start_day', 'start_hour']).set_index('ride_id').reset_index()
            
            # display(trip_data_clean.count())
            
            merged_rides.append(trip_data_clean)
            
        # Skip Non-Existing files
            
        except FileNotFoundError:
            
            pass
            
            # check which files are skipped
            
            # print(f"an Exception occured at - {y}{'{:02d}'.format(m)}")



combined_bike_df = pd.concat(merged_rides, ignore_index=True)

combined_bike_df

Unnamed: 0,ride_id,started_at,start_day,start_hour,start_hour1,start_year,trip_length,start_station_name,member_casual,rideable_type
0,782CEA3C6968D2A6,2020-04-01 00:13:41,2020-04-01,0,0,2020,0 days 00:05:28,Kingsbury St & Erie St,member,docked_bike
1,07F785C9DDA3404C,2020-04-01 00:11:18,2020-04-01,0,0,2020,0 days 00:00:33,Wabash Ave & 9th St,member,docked_bike
2,1FD159E93F7BAFA1,2020-04-01 00:02:35,2020-04-01,0,0,2020,0 days 00:08:10,Wabash Ave & 16th St,member,docked_bike
3,091D47E4F0FC5022,2020-04-01 00:06:44,2020-04-01,0,0,2020,0 days 00:07:17,Mies van der Rohe Way & Chicago Ave,member,docked_bike
4,643593E85E46A45C,2020-04-01 00:13:36,2020-04-01,0,0,2020,0 days 00:05:23,Kingsbury St & Erie St,member,docked_bike
...,...,...,...,...,...,...,...,...,...,...
9813892,6D855DB843848DB3,2023-11-30 23:16:11,2023-11-30,24,23,2023,0 days 00:10:14,Canal St & Adams St,member,classic_bike
9813893,447027EB102601BE,2023-11-30 23:02:56,2023-11-30,24,23,2023,0 days 00:21:13,Lincoln Ave & Fullerton Ave,casual,electric_bike
9813894,993257B9E439A2DD,2023-11-30 23:53:54,2023-11-30,24,23,2023,0 days 00:09:21,Wentworth Ave & Cermak Rd*,member,classic_bike
9813895,9B518D5122FD7D72,2023-11-30 23:53:34,2023-11-30,24,23,2023,0 days 00:09:43,Wentworth Ave & Cermak Rd*,member,classic_bike


## Chicago Weather CSV Data Cleaning and Merging

In [None]:

# Start Weather Data Data Cleaning

# Setting File Path for Csv

weather_filepath = Path('../data/chicago_weather_data_2020_2023_Celcius.csv')

# Read Csv data

weather_data = pd.read_csv(weather_filepath)

# Convert Started at to DT

weather_data['dt_iso'] = pd.to_datetime(weather_data['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC', utc=True)

# Convert UTC to Chicago time by designating a time zone

weather_data['dt_iso_chicago'] = weather_data['dt_iso'].dt.tz_convert('America/Chicago')

# translate started at into start day and start hour

weather_data['start_day'] = weather_data['dt_iso_chicago'].dt.date
weather_data['start_hour'] = weather_data['dt_iso_chicago'].dt.hour

# remove irrelevant data columns and duplicated data

weather_data_sorted = weather_data[['start_day', 'start_hour', 'temp', 'dew_point', 'feels_like', 
                                'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
                                'clouds_all', 'weather_id', 'weather_main', 'weather_description']].copy()

weather_data_sorted = weather_data_sorted.drop_duplicates(subset=['start_day', 'start_hour'])

weather_data_sorted



## Merge the Data Frames

In [None]:
# Merge the Datasets

result = pd.merge(combined_bike_df, weather_data_sorted, on=['start_day', 'start_hour'], how='inner')

# return1 = result.loc[result['start_hour'] == 1, :]

result

## Export Merged CSV

In [None]:
# save file

export_merged_filepath = Path('../output/merged_weather_bike_data.csv')

result.to_csv(export_merged_filepath, index = False)