Import All Data Files

In [1]:
# import packages
#!pip install geopy
import pandas as pd
import numpy as np
import datetime
import re
import geopy.distance

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# fraction of each monthly file to keep, taken from the top of the file;
# set to a value between 0 and 1
data_per_file = 1

# not used by any cell shown here; kept so code that expects the name still runs
lens = []

# user_path = r"\there"
# raw string: the backslashes are Windows path separators, not escape sequences
user_path = r"\HP\Documents\GitHub"

# folder that holds every input csv
data_dir = r"C:\Users" + user_path + r"\CS-396-Divvy-Bikes\Data"

# (year, month) of each monthly trip file, in the order the files are stacked:
# Jan-Apr 2021, then May-Dec 2020, then May 2021
trip_months = (
    [(2021, month) for month in range(1, 5)]
    + [(2020, month) for month in range(5, 13)]
    + [(2021, 5)]
)

# import data
# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies all earlier rows on
# every iteration.
monthly_frames = []
for year, month in trip_months:
    path = data_dir + "\\" + "{}{:02d}-divvy-tripdata.csv".format(year, month)
    input_data = pd.read_csv(path)
    # keep only the leading data_per_file fraction of this file's rows
    split_d = input_data.iloc[:int(len(input_data) * data_per_file)]
    monthly_frames.append(split_d)

# ignore_index=True renumbers the rows 0..n-1 across all files
data = pd.concat(monthly_frames, ignore_index=True)

weather_data = pd.read_csv(data_dir + "\\chicago_hourly_weather_data.csv")

Add Additional Features

In [2]:
# takes string as input and outputs a datetime.datetime struct variable
def getFullDate(str):
    """Parse a timestamp string into a ``datetime.datetime`` at minute resolution.

    The first five runs of digits in the string are used; everything else
    (separators, a seconds field, trailing text) is ignored.  Two field
    orders are understood:

    * month, day, year, hour, minute   e.g. ``"1/23/2021 16:14"``
    * year, month, day, hour, minute   e.g. ``"2021-01-23 16:14:19"``

    The year-first order is recognised by the first field having more than
    two digits, which a month field can never have.

    Parameters
    ----------
    str : str
        The timestamp text.  (The name shadows the builtin; it is kept so
        existing callers are unaffected.)

    Returns
    -------
    datetime.datetime
        Seconds and microseconds are always zero.

    Raises
    ------
    ValueError
        If fewer than five numeric fields are present, or the fields do not
        form a valid date/time.
    """
    # findall (rather than split) so a leading non-digit character does not
    # produce an empty first field
    tokens = re.findall(r'\d+', str)
    if len(tokens) < 5:
        raise ValueError("timestamp needs at least 5 numeric fields, got %r" % (str,))

    first, second, third, hour, minute = (int(tok) for tok in tokens[:5])
    if len(tokens[0]) > 2:
        year, month, day = first, second, third
    else:
        month, day, year = first, second, third
    return datetime.datetime(year, month, day, hour, minute)

# takes datetime.datetime struct as input and gets hour/minutes as a float
def getHours(date):
    """Return the time of day of ``date`` as fractional hours.

    The minutes are converted to a fraction of an hour and added to the
    hour field, e.g. 13:30 -> 13.5.  Seconds are not considered.
    """
    whole_hours = date.hour
    fraction_of_hour = date.minute / 60
    return whole_hours + fraction_of_hour

# takes two sets of latitude and longitude and outputs a distance in miles
def geoDistance(sLat, sLon, eLat, eLon):
    """Return the distance in miles between a start and an end coordinate.

    Each coordinate is passed to ``geopy.distance.distance`` as a
    ``(latitude, longitude)`` tuple, and the ``.miles`` value of the result
    is returned.
    """
    start_point, end_point = (sLat, sLon), (eLat, eLon)
    separation = geopy.distance.distance(start_point, end_point)
    return separation.miles

# takes two station IDs and latitude/longitudes and outputs the distance between them
def rideDistance(stStId, eStId, sLat, sLon, eLat, eLon):
    """Return the distance covered by a ride between two stations.

    When the start and end station identifiers compare equal the ride is
    treated as a round trip and 0 is returned without looking at the
    coordinates; otherwise the result of ``geoDistance`` on the two
    coordinates is returned.
    """
    same_station = stStId == eStId
    return 0 if same_station else geoDistance(sLat, sLon, eLat, eLon)

# converts a datetime.datetime struct to a string for weather data lookup
def weatherString(dt):
    """Format ``dt`` as the hour-resolution key ``YYYY-MM-DDTHH:00:00``.

    Month, day and hour are zero-padded to two digits; minutes and seconds
    are always written as ``00`` so every time within an hour yields the
    same key.  The year is written without padding.

    Every field is taken from the ``dt`` argument itself, so the function
    gives the right answer regardless of what other variables exist in the
    notebook.
    """
    return "{}-{:02d}-{:02d}T{:02d}:00:00".format(dt.year, dt.month, dt.day, dt.hour)

In [3]:
# get columns as vectors
membership = data['member_casual']
start_time = data['started_at']
end_time = data['ended_at']
start_lat = data['start_lat']
start_lon = data['start_lng']
start_StId = data['start_station_name']
end_lat = data['end_lat']
end_lon = data['end_lng']
end_StId = data['end_station_name']

# one float slot per ride for every derived feature; initialised to NaN so a
# slot that is never filled reads as missing rather than as arbitrary memory
n_rides = membership.size
duration = np.full(n_rides, np.nan)
distance = np.full(n_rides, np.nan)
start_hour = np.full(n_rides, np.nan)
end_hour = np.full(n_rides, np.nan)
temp = np.full(n_rides, np.nan)
precip = np.full(n_rides, np.nan)
humid = np.full(n_rides, np.nan)
snow_depth = np.full(n_rides, np.nan)

# Average the weather rows per timestamp once, up front, and keep the result
# as {timestamp string: {column: mean}}.  Each ride then needs a single dict
# lookup instead of a scan of the whole weather table.  A timestamp with one
# row averages to that row's own values; repeated timestamps are averaged.
# NOTE(review): assumes these four weather columns are numeric - confirm.
weather_cols = ['temp', 'precip', 'humidity', 'snowdepth']
weather_by_hour = weather_data.groupby('datetime')[weather_cols].mean().to_dict('index')
no_weather = dict.fromkeys(weather_cols, np.nan)
rides_without_weather = 0

# fill out feature vectors
for i in range(n_rides):
    # find the duration between the start and end of the ride as well as the hour the ride started and stopped
    start_dt = getFullDate(start_time[i])
    end_dt = getFullDate(end_time[i])

    # ride length in minutes
    dur = end_dt-start_dt
    duration[i] = dur.days*24*60 + dur.seconds/60

    start_hour[i] = getHours(start_dt)
    end_hour[i] = getHours(end_dt)

    # find distance between the starting and ending stations
    distance[i] = rideDistance(start_StId[i], end_StId[i], start_lat[i], start_lon[i], end_lat[i], end_lon[i])

    # look up the weather recorded for the hour in which the ride started
    hour_weather = weather_by_hour.get(weatherString(start_dt))
    if hour_weather is None:
        # no weather row for this hour: leave the features as NaN and count it
        rides_without_weather += 1
        hour_weather = no_weather

    temp[i] = hour_weather['temp']
    precip[i] = hour_weather['precip']
    humid[i] = hour_weather['humidity']
    snow_depth[i] = hour_weather['snowdepth']

if rides_without_weather:
    print("{} of {} rides had no matching weather row; their weather features are NaN".format(
        rides_without_weather, n_rides))

# add new columns to dataframe
data['duration'] = duration
data['distance'] = distance
data['start_hour'] = start_hour
data['end_hour'] = end_hour
data['temp'] = temp
data['precip'] = precip
data['humid'] = humid
data['snow_depth'] = snow_depth

In [4]:
# write the combined table (original columns plus the derived features) to one csv file
# Raw string literals keep the Windows backslashes literal; the resulting path
# is the same as before, without relying on invalid escape sequences.
output_path = r"C:\Users" + user_path + r"\CS-396-Divvy-Bikes\Data\all-months-divvy-tripdata-full.csv"
data.to_csv(output_path)