In [31]:
import pandas as pd
import numpy as np
import helpers
import geopy.distance

In [2]:
# Fetch the three raw datasets (bike trips, Covid, weather) through the
# project-local `helpers` module. Each call reports its status on stdout.
helpers.download_bike_data()
helpers.download_covid_data()
helpers.download_weather_data()

All divvy bike files downloaded successfully
Covid data downloaded succesfully
Weather data downloaded succesfully


In [3]:
# Build the combined bike-trip frame via the project helper, then read the two
# CSV extracts; in both files the first column becomes the index.
bikes = helpers.assemble_bike_data()
covid, weather = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/covid/covid_data.csv', 'data/weather/weather_data.csv')
)

### Bike Data Exploration

In [4]:
# Parse both trip timestamps into datetimes so the `.dt` accessor and
# timestamp arithmetic work in the cells below.
for ts_col in ('started_at', 'ended_at'):
    bikes[ts_col] = pd.to_datetime(bikes[ts_col])

In [4]:
# Peek at a single row to see which columns the raw trip data provides.
bikes.head(1)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,8CD5DE2C2B6C4CFC,docked_bike,2020-06-13 23:24:48,2020-06-13 23:36:55,Wilton Ave & Belmont Ave,117,Damen Ave & Clybourn Ave,163.0,41.94018,-87.65304,41.931931,-87.677856,casual


In [5]:
# Preview only the columns that the descriptive statistics below rely on.
# `bikes` itself is deliberately NOT overwritten with this subset: the distance
# cell further down still reads start_lat/start_lng/end_lat/end_lng, and
# rebinding `bikes` to a slice would both drop those columns (KeyError on
# Restart & Run All) and trigger SettingWithCopyWarning on later assignments.
preview_cols = ['rideable_type', 'started_at', 'ended_at',
                'start_station_name', 'end_station_name', 'member_casual']
bikes[preview_cols].head(1)

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,member_casual
0,docked_bike,2020-06-13 23:24:48,2020-06-13 23:36:55,Wilton Ave & Belmont Ave,Damen Ave & Clybourn Ave,casual


In [6]:
# Share of rides with no recorded end station (a fraction in [0, 1])
missing_end = bikes['end_station_name'].isna()
missing_end.sum() / len(bikes)


0.11148484292409247

In [7]:
# Share of rides by rider type (member vs. casual).
# normalize=True returns relative frequencies (fractions that sum to 1), not percentages.
bikes['member_casual'].value_counts(normalize=True)

member    0.569854
casual    0.430146
Name: member_casual, dtype: float64

In [8]:
# Share of rides by bicycle type.
# normalize=True returns relative frequencies (fractions that sum to 1), not raw counts.
bikes['rideable_type'].value_counts(normalize=True)

classic_bike     0.408817
electric_bike    0.327861
docked_bike      0.263322
Name: rideable_type, dtype: float64

In [9]:
# Average number of rides per calendar day: reduce each start timestamp to its
# date, count rides per date, then average those daily counts.
ride_dates = pd.to_datetime(bikes['started_at']).dt.date
ride_dates.value_counts().mean()

13646.067307692309

In [22]:
# Ride duration: end timestamp minus start timestamp
bikes['ride_duration'] = bikes['ended_at'].sub(bikes['started_at'])

In [25]:
# Time of day: bucket every ride by the hour (0-23) in which it started.
#   morning 05-10 · mid_day 11-16 · evening 17-22 · night 23-04
start_hour = bikes['started_at'].dt.hour
bikes['time_of_day'] = None
bikes.loc[start_hour.between(5, 10), 'time_of_day'] = 'morning'
bikes.loc[start_hour.between(11, 16), 'time_of_day'] = 'mid_day'
bikes.loc[start_hour.between(17, 22), 'time_of_day'] = 'evening'
# The night bucket wraps around midnight, hence the OR of two open-ended tests.
bikes.loc[(start_hour >= 23) | (start_hour <= 4), 'time_of_day'] = 'night'
bikes['time_of_day'].value_counts(normalize=True)

mid_day    0.400989
evening    0.358988
morning    0.185245
night      0.054778
Name: time_of_day, dtype: float64

In [33]:
# Inspect the frame after the `time_of_day` column has been added.
bikes

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,time_of_day
0,8CD5DE2C2B6C4CFC,docked_bike,2020-06-13 23:24:48,2020-06-13 23:36:55,Wilton Ave & Belmont Ave,117,Damen Ave & Clybourn Ave,163.0,41.940180,-87.653040,41.931931,-87.677856,casual,night
1,9A191EB2C751D85D,docked_bike,2020-06-26 07:26:10,2020-06-26 07:31:58,Federal St & Polk St,41,Daley Center Plaza,81.0,41.872077,-87.629543,41.884241,-87.629634,member,morning
2,F37D14B0B5659BCF,docked_bike,2020-06-23 17:12:41,2020-06-23 17:21:14,Daley Center Plaza,81,State St & Harrison St,5.0,41.884241,-87.629634,41.874053,-87.627716,member,evening
3,C41237B506E85FA1,docked_bike,2020-06-20 01:09:35,2020-06-20 01:28:24,Broadway & Cornelia Ave,303,Broadway & Berwyn Ave,294.0,41.945529,-87.646439,41.978353,-87.659753,casual,night
4,4B51B3B0BDA7787C,docked_bike,2020-06-25 16:59:25,2020-06-25 17:08:48,Sheffield Ave & Webster Ave,327,Wilton Ave & Belmont Ave,117.0,41.921540,-87.653818,41.940180,-87.653040,casual,mid_day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9934332,8891BA0053ECEC4F,electric_bike,2022-05-27 22:00:02,2022-05-27 22:07:01,Clark St & Newport St,632,,,41.944557,-87.654830,41.920000,-87.650000,member,evening
9934333,47D8B5FBCADECFC1,electric_bike,2022-05-15 16:05:39,2022-05-15 16:44:12,Clark St & Newport St,632,,,41.944479,-87.654758,41.920000,-87.760000,member,mid_day
9934334,AA8D16CF38B40703,electric_bike,2022-05-21 10:10:13,2022-05-21 10:26:09,Francisco Ave & Bloomingdale Ave,429,,,41.910000,-87.700000,41.920000,-87.660000,casual,morning
9934335,897EBFD44F329E0A,electric_bike,2022-05-12 07:53:58,2022-05-12 08:01:18,Francisco Ave & Bloomingdale Ave,429,,,41.910000,-87.700000,41.900000,-87.690000,member,morning


In [35]:
# Distance traveled
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians).

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like of numbers
        Coordinates of the first and the second point. Scalars, equal-length
        arrays, or any combination NumPy can broadcast are accepted.
    to_radians : bool, default True
        If True the inputs are treated as decimal degrees and converted to
        radians first; if False they are used as given.
    earth_radius : float, default 6371
        Radius the central angle is multiplied by; the result is expressed in
        the same unit (kilometres for the default value).

    Returns
    -------
    The great-circle distance, elementwise. NaN inputs yield NaN.

    FROM: https://stackoverflow.com/questions/43577086/
    """
    if to_radians:
        # Convert each argument on its own rather than stacking all four into
        # one array, so scalars and arrays can be mixed and broadcast.
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(coord, dtype=float))
            for coord in (lat1, lon1, lat2, lon2)
        )

    a = np.sin((lat2 - lat1) / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

# Great-circle distance between the start and end coordinates of every ride,
# in kilometres (haversine's default earth_radius is 6371). The column is named
# `haversine_distance` because that is the name the following cells read.
bikes['haversine_distance'] = haversine(
    bikes['start_lat'],
    bikes['start_lng'],
    bikes['end_lat'],
    bikes['end_lng'],
)

In [36]:
# Inspect the frame after the distance column has been added.
bikes

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,time_of_day,haversine_distance
0,8CD5DE2C2B6C4CFC,docked_bike,2020-06-13 23:24:48,2020-06-13 23:36:55,Wilton Ave & Belmont Ave,117,Damen Ave & Clybourn Ave,163.0,41.940180,-87.653040,41.931931,-87.677856,casual,night,2.248318
1,9A191EB2C751D85D,docked_bike,2020-06-26 07:26:10,2020-06-26 07:31:58,Federal St & Polk St,41,Daley Center Plaza,81.0,41.872077,-87.629543,41.884241,-87.629634,member,morning,1.352596
2,F37D14B0B5659BCF,docked_bike,2020-06-23 17:12:41,2020-06-23 17:21:14,Daley Center Plaza,81,State St & Harrison St,5.0,41.884241,-87.629634,41.874053,-87.627716,member,evening,1.143929
3,C41237B506E85FA1,docked_bike,2020-06-20 01:09:35,2020-06-20 01:28:24,Broadway & Cornelia Ave,303,Broadway & Berwyn Ave,294.0,41.945529,-87.646439,41.978353,-87.659753,casual,night,3.812264
4,4B51B3B0BDA7787C,docked_bike,2020-06-25 16:59:25,2020-06-25 17:08:48,Sheffield Ave & Webster Ave,327,Wilton Ave & Belmont Ave,117.0,41.921540,-87.653818,41.940180,-87.653040,casual,mid_day,2.073672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9934332,8891BA0053ECEC4F,electric_bike,2022-05-27 22:00:02,2022-05-27 22:07:01,Clark St & Newport St,632,,,41.944557,-87.654830,41.920000,-87.650000,member,evening,2.759668
9934333,47D8B5FBCADECFC1,electric_bike,2022-05-15 16:05:39,2022-05-15 16:44:12,Clark St & Newport St,632,,,41.944479,-87.654758,41.920000,-87.760000,member,mid_day,9.121446
9934334,AA8D16CF38B40703,electric_bike,2022-05-21 10:10:13,2022-05-21 10:26:09,Francisco Ave & Bloomingdale Ave,429,,,41.910000,-87.700000,41.920000,-87.660000,casual,morning,3.491561
9934335,897EBFD44F329E0A,electric_bike,2022-05-12 07:53:58,2022-05-12 08:01:18,Francisco Ave & Bloomingdale Ave,429,,,41.910000,-87.700000,41.900000,-87.690000,member,morning,1.386112


In [37]:
# Look at the computed start-to-end distances on their own.
bikes['haversine_distance']

0          2.248318
1          1.352596
2          1.143929
3          3.812264
4          2.073672
             ...   
9934332    2.759668
9934333    9.121446
9934334    3.491561
9934335    1.386112
9934336    0.644634
Name: haversine_distance, Length: 9934337, dtype: float64

### Weather

In [10]:
# Move the index back into a regular column, call that column `date`
# (it arrives as `time`), and parse it as datetime so it can serve as a merge key.
weather = (
    weather
    .reset_index()
    .rename(columns={'time': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
weather.head(1)

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2020-06-01,18.7,12.8,25.0,0.0,0.0,187.0,16.8,,1021.1,


In [11]:
# Summary statistics for the numeric weather columns.
weather.describe()

Unnamed: 0,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
count,730.0,730.0,730.0,730.0,730.0,728.0,730.0,0.0,728.0,0.0
mean,12.124521,7.573288,16.654384,2.569726,18.915068,187.473901,16.110137,,1016.68283,
std,11.049631,10.730799,11.714726,6.688645,73.366423,99.458766,5.574863,,7.059626,
min,-17.5,-21.1,-14.4,0.0,0.0,1.0,3.8,,995.2,
25%,3.225,-0.6,7.2,0.0,0.0,90.0,12.125,,1012.0,
50%,12.1,7.2,17.2,0.0,0.0,207.5,15.6,,1016.1,
75%,22.475,17.8,27.2,1.3,0.0,262.0,19.1,,1021.2,
max,29.7,25.0,35.6,68.3,660.0,360.0,37.9,,1037.6,


### Covid

In [12]:
# Align the Covid table with the other sources: lower-case the `Date` column
# name and parse it as datetime so it can serve as a merge key.
covid = (
    covid
    .rename(columns={'Date': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
covid.head(1)

Unnamed: 0,date,Cases - Total,Deaths - Total,Hospitalizations - Total,Cases - Age 0-17,Cases - Age 18-29,Cases - Age 30-39,Cases - Age 40-49,Cases - Age 50-59,Cases - Age 60-69,...,Hospitalizations - Age Unknown,Hospitalizations - Female,Hospitalizations - Male,Hospitalizations - Unknown Gender,Hospitalizations - Latinx,Hospitalizations - Asian Non-Latinx,Hospitalizations - Black Non-Latinx,Hospitalizations - White Non-Latinx,Hospitalizations - Other Race Non-Latinx,Hospitalizations - Unknown Race/Ethnicity
0,2021-06-12,46,5,8.0,5,13,13,3,7,1,...,0.0,3.0,5.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0


In [13]:
# List every column available in the Covid table.
covid.columns

Index(['date', 'Cases - Total', 'Deaths - Total', 'Hospitalizations - Total',
       'Cases - Age 0-17', 'Cases - Age 18-29', 'Cases - Age 30-39',
       'Cases - Age 40-49', 'Cases - Age 50-59', 'Cases - Age 60-69',
       'Cases - Age 70-79', 'Cases -  Age 80+', 'Cases - Age Unknown',
       'Cases - Female', 'Cases - Male', 'Cases - Unknown Gender',
       'Cases - Latinx', 'Cases - Asian Non-Latinx',
       'Cases - Black Non-Latinx', 'Cases - White Non-Latinx',
       'Cases - Other Race Non-Latinx', 'Cases - Unknown Race/Ethnicity',
       'Deaths - Age 0-17', 'Deaths - Age 18-29', 'Deaths - Age 30-39',
       'Deaths - Age 40-49', 'Deaths - Age 50-59', 'Deaths - Age 60-69',
       'Deaths - Age 70-79', 'Deaths - Age 80+', 'Deaths - Age Unknown',
       'Deaths - Female', 'Deaths - Male', 'Deaths - Unknown Gender',
       'Deaths - Latinx', 'Deaths - Asian Non-Latinx',
       'Deaths - Black Non-Latinx', 'Deaths - White Non-Latinx',
       'Deaths - Other Race Non-Latinx', 'Death

In [14]:
# Summary statistics for the numeric Covid columns.
covid.describe()

Unnamed: 0,Cases - Total,Deaths - Total,Hospitalizations - Total,Cases - Age 0-17,Cases - Age 18-29,Cases - Age 30-39,Cases - Age 40-49,Cases - Age 50-59,Cases - Age 60-69,Cases - Age 70-79,...,Hospitalizations - Age Unknown,Hospitalizations - Female,Hospitalizations - Male,Hospitalizations - Unknown Gender,Hospitalizations - Latinx,Hospitalizations - Asian Non-Latinx,Hospitalizations - Black Non-Latinx,Hospitalizations - White Non-Latinx,Hospitalizations - Other Race Non-Latinx,Hospitalizations - Unknown Race/Ethnicity
count,871.0,871.0,866.0,871.0,871.0,871.0,871.0,871.0,871.0,871.0,...,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0
mean,750.339839,8.903559,50.987298,121.770379,175.189437,150.910448,109.235362,87.925373,58.560276,29.198622,...,0.010393,25.431871,25.548499,0.006928,13.332564,1.483834,23.264434,10.415704,1.666282,0.82448
std,1118.255519,10.611851,53.336156,230.690513,268.039626,225.788297,157.068229,125.149419,82.479692,38.948015,...,0.101472,26.052825,27.896391,0.082996,15.73789,2.502317,25.41845,10.677855,2.322186,1.233127
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,255.0,2.0,21.0,30.0,58.5,52.0,35.0,28.0,18.0,9.0,...,0.0,10.25,9.0,0.0,4.0,0.0,8.0,4.0,0.0,0.0
50%,434.0,5.0,31.0,63.0,105.0,88.0,64.0,49.0,32.0,16.0,...,0.0,16.5,15.0,0.0,8.0,1.0,14.0,7.0,1.0,0.0
75%,815.0,12.0,62.0,117.5,186.5,163.5,122.5,101.5,72.0,37.0,...,0.0,31.0,31.0,0.0,16.0,2.0,29.0,13.0,2.0,1.0
max,10433.0,58.0,696.0,2821.0,2303.0,2009.0,1393.0,1110.0,743.0,336.0,...,1.0,338.0,358.0,1.0,219.0,51.0,235.0,150.0,29.0,12.0


### Time Series Table

In [29]:
# Per-ride frame for the time-series table: copy only the needed columns,
# rename the start timestamp to `date`, and reduce it to the calendar day.
ride_cols = ['started_at', 'ride_duration', 'rideable_type',
             'start_station_name', 'end_station_name', 'member_casual']
t_bikes = bikes[ride_cols].copy().rename(columns={'started_at': 'date'})
t_bikes['date'] = pd.to_datetime(t_bikes['date']).dt.date

In [28]:
# Inspect the per-ride frame that feeds the daily aggregation.
t_bikes

Unnamed: 0,date,rideable_type,ride_duration,start_station_name,end_station_name,member_casual
0,2020-06-13,docked_bike,0 days 00:12:07,Wilton Ave & Belmont Ave,Damen Ave & Clybourn Ave,casual
1,2020-06-26,docked_bike,0 days 00:05:48,Federal St & Polk St,Daley Center Plaza,member
2,2020-06-23,docked_bike,0 days 00:08:33,Daley Center Plaza,State St & Harrison St,member
3,2020-06-20,docked_bike,0 days 00:18:49,Broadway & Cornelia Ave,Broadway & Berwyn Ave,casual
4,2020-06-25,docked_bike,0 days 00:09:23,Sheffield Ave & Webster Ave,Wilton Ave & Belmont Ave,casual
...,...,...,...,...,...,...
9934332,2022-05-27,electric_bike,0 days 00:06:59,Clark St & Newport St,,member
9934333,2022-05-15,electric_bike,0 days 00:38:33,Clark St & Newport St,,member
9934334,2022-05-21,electric_bike,0 days 00:15:56,Francisco Ave & Bloomingdale Ave,,casual
9934335,2022-05-12,electric_bike,0 days 00:07:20,Francisco Ave & Bloomingdale Ave,,member


In [27]:
# Count rides per calendar day; the count column is named `rides` directly
# when the group keys are moved back into a column.
rides_per_day = t_bikes.groupby(['date']).size().reset_index(name='rides')
# The group keys are plain `date` objects; convert them to datetime so they
# match the `date` columns of the other tables when merging.
rides_per_day['date'] = pd.to_datetime(rides_per_day['date'])


In [17]:
# One row per calendar day between the project's configured start and end dates.
all_days = pd.date_range(helpers.START_DATE, helpers.END_DATE, freq='D')
time_table = pd.DataFrame({'date': all_days})


In [18]:
# Join the daily ride counts, weather, and headline Covid totals onto the
# calendar. Each merge uses pandas' default inner join, so only dates present
# in every table survive.
(
    time_table
    .merge(rides_per_day, on='date')
    .merge(weather, on='date')
    .merge(
        covid[['date', 'Cases - Total', 'Deaths - Total', 'Hospitalizations - Total']],
        on='date',
    )
)

Unnamed: 0,date,rides,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,Cases - Total,Deaths - Total,Hospitalizations - Total
0,2020-06-03,6296,24.6,21.1,29.4,0.3,0.0,311.0,13.5,,1009.0,,280,21,50.0
1,2020-06-04,7628,25.5,20.0,30.0,0.0,0.0,150.0,11.7,,1007.1,,319,15,31.0
2,2020-06-05,8922,26.8,22.2,31.7,0.0,0.0,239.0,12.1,,1009.5,,337,17,42.0
3,2020-06-06,15239,21.6,16.1,24.4,0.0,0.0,25.0,16.2,,1016.5,,170,20,38.0
4,2020-06-07,13224,20.3,15.0,26.1,0.0,0.0,70.0,13.3,,1017.5,,110,20,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,2022-05-27,16357,15.0,11.1,18.3,3.3,0.0,353.0,18.0,,1008.9,,1044,1,28.0
724,2022-05-28,31934,19.0,10.6,24.4,0.0,0.0,189.0,10.2,,1012.0,,603,2,31.0
725,2022-05-29,33377,24.5,18.3,30.6,0.0,0.0,183.0,22.6,,1008.7,,535,0,30.0
726,2022-05-30,30470,27.9,22.8,33.3,0.0,0.0,195.0,28.0,,1009.0,,379,1,28.0
