In [1]:
import pandas as pd

df1 = pd.read_csv('Divvy 2020 merged nostation.csv')

  df1 = pd.read_csv('Divvy 2020 merged nostation.csv')


# Data Cleaning for 2020-2022

# 2020 Seperate

In [2]:
df1.dtypes

Unnamed: 0             int64
ride_id               object
rideable_type         object
start_time            object
end_time              object
from_station_name     object
from_station_id       object
to_station_name       object
end_station_id        object
start_lat            float64
start_lng            float64
end_lat              float64
end_lng              float64
usertype              object
dtype: object

In [3]:
df1.isnull().sum()

Unnamed: 0                0
ride_id                   0
rideable_type             0
start_time                0
end_time                  0
from_station_name     94656
from_station_id       95282
to_station_name      110881
end_station_id       111342
start_lat                 0
start_lng                 0
end_lat                4255
end_lng                4255
usertype                  0
dtype: int64

In [4]:
# in order to fix our station name/id situation, we need to use start_lat and start_lng values, so I need to investigate where
# the nulls of end_lat and end_lng are so that I can either delete them (in cases if both are null) or retain them (to salvage)

null_lats = list(df1[df1['end_lat'].isnull()].index)
null_lngs = list(df1[df1['end_lng'].isnull()].index)

In [5]:
# Finding out if all of the index positions overlap
set(df1[df1['end_lat'].isnull()].index == df1[df1['end_lng'].isnull()].index)

{True}

In [6]:
# Because all of the index positions overlap, this means we can simply delete these rows by using only one condition
df1.drop(df1[df1['end_lat'].isnull()].index, inplace=True)

In [7]:
df1.isnull().sum()

Unnamed: 0                0
ride_id                   0
rideable_type             0
start_time                0
end_time                  0
from_station_name     94656
from_station_id       95282
to_station_name      106626
end_station_id       107087
start_lat                 0
start_lng                 0
end_lat                   0
end_lng                   0
usertype                  0
dtype: int64

In [8]:
display(df1)

Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
0,0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member
1,1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member
2,2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member
3,3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member
4,4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541678,3541678,4631EE956BCEA738,classic_bike,2020-12-19 13:59:33,2020-12-19 14:03:21,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836208,-87.613533,41.838842,-87.621857,casual
3541679,3541679,D17CBEDEA8DBDFF6,electric_bike,2020-12-20 15:48:41,2020-12-20 15:52:14,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836723,-87.613365,41.838723,-87.621854,member
3541680,3541680,447A6C67E9AF962E,docked_bike,2020-12-02 16:59:58,2020-12-02 17:08:28,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836208,-87.613533,41.838842,-87.621857,member
3541681,3541681,F558C17E95751C62,electric_bike,2020-12-20 18:06:52,2020-12-20 18:09:53,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836734,-87.613394,41.838749,-87.621862,member


In [9]:
df1 = df1.drop('Unnamed: 0',axis=1)
df1 = df1.drop_duplicates()
df1 = df1.drop_duplicates(subset=['ride_id']).reset_index()

In [10]:
# In previous iterations, I tried to salvage values. Doing this requires hardware I don't have access to and knowledge that I
# don't currently have access to. The biggest challenge is determining what to do with overlaps, and how likely it is that
# one station is a better pick than another over the course of a ride. I'm sure that if I had a month to take the time to figure
# this out, as well as access to geo-location searches and a hyper-fast computer beyond the realms of home PC enthusiast level,
# that I could design a system to fill in the blanks. However, for the sake of expediency, I will simply delete the values
# that are missing both station_name and station_id

In [11]:
df1

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
0,0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member
1,1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member
2,2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member
3,3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member
4,4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3537215,3541678,4631EE956BCEA738,classic_bike,2020-12-19 13:59:33,2020-12-19 14:03:21,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836208,-87.613533,41.838842,-87.621857,casual
3537216,3541679,D17CBEDEA8DBDFF6,electric_bike,2020-12-20 15:48:41,2020-12-20 15:52:14,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836723,-87.613365,41.838723,-87.621854,member
3537217,3541680,447A6C67E9AF962E,docked_bike,2020-12-02 16:59:58,2020-12-02 17:08:28,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836208,-87.613533,41.838842,-87.621857,member
3537218,3541681,F558C17E95751C62,electric_bike,2020-12-20 18:06:52,2020-12-20 18:09:53,Rhodes Ave & 32nd St,13215,Indiana Ave & 31st St,TA1308000036,41.836734,-87.613394,41.838749,-87.621862,member


In [12]:
null_from_names = df1.iloc[df1[df1['from_station_name'].isnull()].index]
null_from_id = df1.iloc[df1[df1['from_station_id'].isnull()].index]

In [13]:
# This function drops rows with nulls from the subset if more than one column has a null. In this case, we only have two columns
# and this means that if both are null, the value is dropped, however if only one is null, the row is not dropped
df1 = df1.dropna(subset=['from_station_name', 'from_station_id'], thresh=1)

# Will do the same for to_station_name and end_station_id
df1 = df1.dropna(subset=['to_station_name', 'end_station_id'], thresh=1)

In [14]:
df1.isnull().sum()

index                  0
ride_id                0
rideable_type          0
start_time             0
end_time               0
from_station_name      0
from_station_id      524
to_station_name        0
end_station_id       404
start_lat              0
start_lng              0
end_lat                0
end_lng                0
usertype               0
dtype: int64

In [15]:
df1[df1['from_station_id'].isnull()]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
1182613,1183690,248BC2E2842C3468,electric_bike,2020-07-30 16:39:55,2020-07-30 16:40:33,hubbard_test_lws,,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,41.89,-87.68,41.889975,-87.680321,casual
1607481,1609140,87604F9E0202C24C,electric_bike,2020-08-28 08:29:35,2020-08-28 09:18:49,W Oakdale Ave & N Broadway,,W Oakdale Ave & N Broadway,,41.94,-87.64,41.940000,-87.640000,casual
1607854,1609513,7E1268F06DB1E8ED,electric_bike,2020-08-22 14:25:05,2020-08-22 14:32:07,W Oakdale Ave & N Broadway,,Clark St & Wrightwood Ave,340.0,41.94,-87.64,41.929563,-87.643578,casual
1609231,1610890,ED4D2E4B2B836445,electric_bike,2020-08-22 14:25:28,2020-08-22 14:32:10,W Oakdale Ave & N Broadway,,Clark St & Wrightwood Ave,340.0,41.94,-87.64,41.929554,-87.643613,casual
1609244,1610903,54A93E5423F9B0E4,electric_bike,2020-08-22 14:17:21,2020-08-22 14:25:15,W Oakdale Ave & N Broadway,,Racine Ave & Belmont Ave,226.0,41.94,-87.64,41.939654,-87.658884,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3364198,3368284,D6DE4B1BDBB42009,electric_bike,2020-11-08 13:27:09,2020-11-08 13:43:26,W Oakdale Ave & N Broadway,,Damen Ave & Clybourn Ave,163.0,41.94,-87.64,41.931882,-87.677793,member
3364199,3368285,A781FD09251C80CD,electric_bike,2020-11-08 13:26:28,2020-11-08 13:26:44,W Oakdale Ave & N Broadway,,W Oakdale Ave & N Broadway,,41.94,-87.64,41.940000,-87.640000,member
3375393,3379500,DAE7E54F9A80C7CF,electric_bike,2020-11-13 10:43:28,2020-11-13 10:50:25,W Oakdale Ave & N Broadway,,Southport Ave & Roscoe St,229.0,41.94,-87.64,41.943624,-87.664002,member
3383613,3387736,84E9039AD0B2F387,electric_bike,2020-11-06 17:11:02,2020-11-06 17:20:38,W Oakdale Ave & N Broadway,,Clark St & North Ave,126.0,41.94,-87.64,41.912034,-87.631915,member


In [16]:
# fixing one station at a time from here out
df1.loc[df1['from_station_name'] == 'W Oakdale Ave & N Broadway', 'from_station_id'] = '20252.0'

In [17]:
df1[df1['from_station_id'].isnull()]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
1182613,1183690,248BC2E2842C3468,electric_bike,2020-07-30 16:39:55,2020-07-30 16:40:33,hubbard_test_lws,,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,41.89,-87.68,41.889975,-87.680321,casual
1613025,1614684,4D9CCC0EDC37A8CD,electric_bike,2020-08-23 15:34:56,2020-08-23 16:14:42,W Armitage Ave & N Sheffield Ave,,Calumet Ave & 18th St,338.0,41.92,-87.65,41.857656,-87.619272,casual
1615588,1617247,74F7C9FB57E3C1B2,electric_bike,2020-08-30 06:19:38,2020-08-30 06:21:01,W Armitage Ave & N Sheffield Ave,,W Armitage Ave & N Sheffield Ave,,41.92,-87.65,41.920000,-87.650000,casual
1617728,1619387,B12C0A27D55DC96F,electric_bike,2020-08-13 16:32:55,2020-08-13 16:33:08,W Armitage Ave & N Sheffield Ave,,W Armitage Ave & N Sheffield Ave,,41.92,-87.65,41.920000,-87.650000,casual
1619334,1620993,A69C76CF7E0B60F0,electric_bike,2020-08-30 13:09:44,2020-08-30 13:15:03,W Armitage Ave & N Sheffield Ave,,Clifton Ave & Armitage Ave,223.0,41.92,-87.65,41.918184,-87.657018,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3160709,3164569,083B47675BFA7372,electric_bike,2020-11-07 18:51:09,2020-11-07 18:56:12,W Armitage Ave & N Sheffield Ave,,Halsted St & Wrightwood Ave,349.0,41.92,-87.65,41.929150,-87.649206,casual
3160906,3164766,540A1AA801E745FA,electric_bike,2020-11-11 17:45:43,2020-11-11 18:10:55,W Armitage Ave & N Sheffield Ave,,Daley Center Plaza,81.0,41.92,-87.65,41.884172,-87.629186,casual
3202062,3205934,049F2BBABEACA493,electric_bike,2020-11-06 07:42:48,2020-11-06 07:51:57,W Armitage Ave & N Sheffield Ave,,Lake Shore Dr & Diversey Pkwy,329.0,41.92,-87.65,41.932834,-87.636649,member
3223086,3226988,A25B7D5B317983C5,electric_bike,2020-11-27 10:19:38,2020-11-27 10:26:39,W Armitage Ave & N Sheffield Ave,,Southport Ave & Wrightwood Ave,190.0,41.92,-87.65,41.928880,-87.663797,member


In [18]:
df1[df1['from_station_name'] == 'W Armitage Ave & N Sheffield Ave']

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
1613025,1614684,4D9CCC0EDC37A8CD,electric_bike,2020-08-23 15:34:56,2020-08-23 16:14:42,W Armitage Ave & N Sheffield Ave,,Calumet Ave & 18th St,338.0,41.92,-87.65,41.857656,-87.619272,casual
1615588,1617247,74F7C9FB57E3C1B2,electric_bike,2020-08-30 06:19:38,2020-08-30 06:21:01,W Armitage Ave & N Sheffield Ave,,W Armitage Ave & N Sheffield Ave,,41.92,-87.65,41.920000,-87.650000,casual
1617728,1619387,B12C0A27D55DC96F,electric_bike,2020-08-13 16:32:55,2020-08-13 16:33:08,W Armitage Ave & N Sheffield Ave,,W Armitage Ave & N Sheffield Ave,,41.92,-87.65,41.920000,-87.650000,casual
1619334,1620993,A69C76CF7E0B60F0,electric_bike,2020-08-30 13:09:44,2020-08-30 13:15:03,W Armitage Ave & N Sheffield Ave,,Clifton Ave & Armitage Ave,223.0,41.92,-87.65,41.918184,-87.657018,casual
1620664,1622323,69320923D1AE603A,electric_bike,2020-08-26 22:00:47,2020-08-26 22:34:16,W Armitage Ave & N Sheffield Ave,,Wells St & Polk St,175.0,41.92,-87.65,41.872297,-87.633103,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3461827,3466088,724CAABCBF093487,electric_bike,2020-12-05 16:58:06,2020-12-05 17:07:57,W Armitage Ave & N Sheffield Ave,20254.0,Damen Ave & Cortland St,13133,41.92,-87.65,41.915979,-87.677132,member
3471347,3475623,D98E43B296F48790,electric_bike,2020-12-05 17:34:50,2020-12-05 17:47:31,W Armitage Ave & N Sheffield Ave,20254.0,Wells St & Huron St,TA1306000012,41.92,-87.65,41.894995,-87.634321,member
3478367,3482656,F0F76E1C64D69679,electric_bike,2020-12-14 13:50:19,2020-12-14 13:53:45,W Armitage Ave & N Sheffield Ave,20254.0,Sheffield Ave & Fullerton Ave,TA1306000016,41.92,-87.65,41.925507,-87.653717,member
3504177,3508507,EAE1F6134BE03E6C,electric_bike,2020-12-08 09:18:34,2020-12-08 09:33:04,W Armitage Ave & N Sheffield Ave,20254.0,LaSalle Dr & Huron St,KP1705001026,41.92,-87.65,41.895034,-87.632505,casual


In [19]:
df1.loc[df1['from_station_name'] == 'W Armitage Ave & N Sheffield Ave', 'from_station_id'] = '20254.0'

In [20]:
df1[df1['from_station_id'].isnull()]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
1182613,1183690,248BC2E2842C3468,electric_bike,2020-07-30 16:39:55,2020-07-30 16:40:33,hubbard_test_lws,,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,41.89,-87.68,41.889975,-87.680321,casual


In [21]:
df1[df1['from_station_name'] == 'hubbard_test_lws']

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
1182613,1183690,248BC2E2842C3468,electric_bike,2020-07-30 16:39:55,2020-07-30 16:40:33,hubbard_test_lws,,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,41.89,-87.68,41.889975,-87.680321,casual


In [22]:
# Now that we have observed that there are values that include "test", we can assume that we should remove these values from the
# dataframe. I will go through all of them at the end to determine this. For now, I will delete this row.

In [23]:
df1 = df1.drop(1182613)

In [24]:
df1.isnull().sum()

index                  0
ride_id                0
rideable_type          0
start_time             0
end_time               0
from_station_name      0
from_station_id        0
to_station_name        0
end_station_id       404
start_lat              0
start_lng              0
end_lat                0
end_lng                0
usertype               0
dtype: int64

In [25]:
# now to investigate end_station_id


In [26]:
df1[df1['end_station_id'].isnull()]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
1605826,1607485,427ED3D8BD480E4B,electric_bike,2020-08-29 23:02:04,2020-08-29 23:25:43,Broadway & Granville Ave,454.0,W Oakdale Ave & N Broadway,,41.994839,-87.660244,41.94,-87.64,casual
1605873,1607532,F3A221B47D3A479A,electric_bike,2020-08-29 22:31:43,2020-08-29 22:41:58,Pine Grove Ave & Waveland Ave,232.0,W Oakdale Ave & N Broadway,,41.949315,-87.646370,41.94,-87.64,casual
1606829,1608488,082313F9D5DC4A13,electric_bike,2020-08-31 15:50:27,2020-08-31 16:04:18,Clark St & Armitage Ave,94.0,W Oakdale Ave & N Broadway,,41.918296,-87.636336,41.94,-87.64,casual
1606874,1608533,FDE9F1A39375EE33,electric_bike,2020-08-27 09:27:10,2020-08-27 09:48:31,Wells St & Hubbard St,212.0,W Oakdale Ave & N Broadway,,41.889840,-87.634320,41.94,-87.64,casual
1607078,1608737,C6D3109E1569A3B8,electric_bike,2020-08-16 16:15:09,2020-08-16 16:22:48,Clark St & Armitage Ave,94.0,W Oakdale Ave & N Broadway,,41.918386,-87.636411,41.94,-87.64,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3364199,3368285,A781FD09251C80CD,electric_bike,2020-11-08 13:26:28,2020-11-08 13:26:44,W Oakdale Ave & N Broadway,20252.0,W Oakdale Ave & N Broadway,,41.940000,-87.640000,41.94,-87.64,member
3371194,3375289,008955EFCA1CF51B,electric_bike,2020-11-17 15:00:11,2020-11-17 15:06:11,Lakeview Ave & Fullerton Pkwy,313.0,W Oakdale Ave & N Broadway,,41.925879,-87.639162,41.94,-87.64,member
3375401,3379508,9EEAB641C0FC8A7C,electric_bike,2020-11-28 15:27:53,2020-11-28 15:33:57,Racine Ave & Belmont Ave,226.0,W Oakdale Ave & N Broadway,,41.939675,-87.658842,41.94,-87.64,member
3379022,3383137,273D3DFFDA195C7C,electric_bike,2020-11-04 12:38:16,2020-11-04 12:42:09,Lake Shore Dr & Belmont Ave,334.0,W Oakdale Ave & N Broadway,,41.940687,-87.639184,41.94,-87.64,member


In [27]:
df1.loc[df1['to_station_name'] == 'W Oakdale Ave & N Broadway', 'end_station_id'] = '20252.0'

In [28]:
df1[df1['end_station_id'].isnull()]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
1608987,1610646,FEE5F6365A9B42AE,electric_bike,2020-08-15 09:48:44,2020-08-15 10:03:32,Clark St & Elm St,176.0,W Armitage Ave & N Sheffield Ave,,41.902854,-87.631760,41.92,-87.65,casual
1610104,1611763,9BA05D744F2430B6,electric_bike,2020-08-17 11:57:07,2020-08-17 12:22:25,Green St & Madison St,198.0,W Armitage Ave & N Sheffield Ave,,41.882013,-87.648975,41.92,-87.65,casual
1615588,1617247,74F7C9FB57E3C1B2,electric_bike,2020-08-30 06:19:38,2020-08-30 06:21:01,W Armitage Ave & N Sheffield Ave,20254.0,W Armitage Ave & N Sheffield Ave,,41.920000,-87.650000,41.92,-87.65,casual
1617518,1619177,BF0DFA178923F1EB,electric_bike,2020-08-26 18:44:47,2020-08-26 19:35:36,Kimbark Ave & 53rd St,322.0,W Armitage Ave & N Sheffield Ave,,41.799625,-87.594797,41.92,-87.65,casual
1617728,1619387,B12C0A27D55DC96F,electric_bike,2020-08-13 16:32:55,2020-08-13 16:33:08,W Armitage Ave & N Sheffield Ave,20254.0,W Armitage Ave & N Sheffield Ave,,41.920000,-87.650000,41.92,-87.65,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2940057,2943596,F5344999C2763FE4,electric_bike,2020-10-18 09:50:31,2020-10-18 09:50:42,W Armitage Ave & N Sheffield Ave,20254.0,W Armitage Ave & N Sheffield Ave,,41.920000,-87.650000,41.92,-87.65,member
2940364,2943904,367FD84357CEC604,electric_bike,2020-10-08 19:32:27,2020-10-08 19:41:10,Sheffield Ave & Wellington Ave,115.0,W Armitage Ave & N Sheffield Ave,,41.936356,-87.652900,41.92,-87.65,member
2951290,2954840,A7BB79F54B4F619A,electric_bike,2020-10-02 16:37:37,2020-10-02 16:50:54,Kingsbury St & Erie St,74.0,W Armitage Ave & N Sheffield Ave,,41.894019,-87.641917,41.92,-87.65,member
3035179,3038862,49F6DEE3FD2BC50B,electric_bike,2020-10-13 17:03:07,2020-10-13 17:19:12,Clark St & Newport St,632.0,W Armitage Ave & N Sheffield Ave,,41.944564,-87.654732,41.92,-87.65,member


In [29]:
df1.loc[df1['to_station_name'] == 'W Armitage Ave & N Sheffield Ave', 'end_station_id'] = '20254.0'

In [30]:
df1[df1['end_station_id'].isnull()]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype


# 2021 Seperate

In [31]:
# Now to rinse and repeat with the other datasets.

In [32]:
df2 = pd.read_csv('Divvy 2021 merged nostation.csv')

In [33]:
df2

Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
0,0,E19E6F1B8D4C42ED,electric_bike,2021-01-23 16:14:19,2021-01-23 16:24:44,California Ave & Cortez St,17660,,,41.900341,-87.696743,41.890000,-87.720000,member
1,1,DC88F20C2C55F27F,electric_bike,2021-01-27 18:43:08,2021-01-27 18:47:12,California Ave & Cortez St,17660,,,41.900333,-87.696707,41.900000,-87.690000,member
2,2,EC45C94683FE3F27,electric_bike,2021-01-21 22:35:54,2021-01-21 22:37:14,California Ave & Cortez St,17660,,,41.900313,-87.696643,41.900000,-87.700000,member
3,3,4FA453A75AE377DB,electric_bike,2021-01-07 13:31:13,2021-01-07 13:42:55,California Ave & Cortez St,17660,,,41.900399,-87.696662,41.920000,-87.690000,member
4,4,BE5E8EB4E7263A0B,electric_bike,2021-01-23 02:24:02,2021-01-23 02:24:45,California Ave & Cortez St,17660,,,41.900326,-87.696697,41.900000,-87.700000,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595058,5595058,847431F3D5353AB7,electric_bike,2021-12-12 13:36:55,2021-12-12 13:56:08,Canal St & Madison St,13341,,,41.882289,-87.639752,41.890000,-87.610000,casual
5595059,5595059,CF407BBC3B9FAD63,electric_bike,2021-12-06 19:37:50,2021-12-06 19:44:51,Canal St & Madison St,13341,Kingsbury St & Kinzie St,KA1503000043,41.882123,-87.640053,41.889106,-87.638862,member
5595060,5595060,60BB69EBF5440E92,electric_bike,2021-12-02 08:57:04,2021-12-02 09:05:21,Canal St & Madison St,13341,Dearborn St & Monroe St,TA1305000006,41.881956,-87.639955,41.880254,-87.629603,member
5595061,5595061,C414F654A28635B8,electric_bike,2021-12-13 09:00:26,2021-12-13 09:14:39,Lawndale Ave & 16th St,362.0,,,41.860000,-87.720000,41.850000,-87.710000,member


In [34]:
df2.isnull().sum()

Unnamed: 0                0
ride_id                   0
rideable_type             0
start_time                0
end_time                  0
from_station_name    690809
from_station_id      690806
to_station_name      739170
end_station_id       739170
start_lat                 0
start_lng                 0
end_lat                4771
end_lng                4771
usertype                  0
dtype: int64

In [35]:
null_lats2 = list(df2[df2['end_lat'].isnull()].index)
null_lngs2 = list(df2[df2['end_lng'].isnull()].index)
set(df2[df2['end_lat'].isnull()].index == df2[df2['end_lng'].isnull()].index)

{True}

In [36]:
df2.drop(df2[df2['end_lat'].isnull()].index, inplace=True)

In [37]:
df2.isnull().sum()

Unnamed: 0                0
ride_id                   0
rideable_type             0
start_time                0
end_time                  0
from_station_name    690809
from_station_id      690806
to_station_name      734399
end_station_id       734399
start_lat                 0
start_lng                 0
end_lat                   0
end_lng                   0
usertype                  0
dtype: int64

In [38]:
# Again, if I had the time and it would benefit a company that was employing me, I would try to see how many of these I could
# fix before deletion. However, I don't have access to a supercomputer and don't have a database of the station information that
# is comprehensive enough to be able to do so.

# Dropping columns with dual nulls
df2 = df2.dropna(subset=['from_station_name', 'from_station_id'], thresh=1)

# Will do the same for to_station_name and end_station_id
df2 = df2.dropna(subset=['to_station_name', 'end_station_id'], thresh=1)

In [39]:
df2.isnull().sum()

Unnamed: 0           0
ride_id              0
rideable_type        0
start_time           0
end_time             0
from_station_name    1
from_station_id      0
to_station_name      0
end_station_id       0
start_lat            0
start_lng            0
end_lat              0
end_lng              0
usertype             0
dtype: int64

In [40]:
df2[df2['from_station_name'].isnull()]

Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
3668173,3668173,DE82A15026BA3056,electric_bike,2021-09-21 18:18:59,2021-09-21 18:21:48,,20215,Hegewisch Metra Station,20215,41.6485,-87.546089,41.648589,-87.54625,casual


In [41]:
df2[df2['from_station_id'] == '20215']

Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
16893,16893,F355A001C7552ADD,classic_bike,2021-01-21 14:18:53,2021-01-21 14:45:50,Hegewisch Metra Station,20215,Burnham Greenway & 105th St,20222,41.648501,-87.546090,41.704575,-87.528232,member
334646,334646,23FE33AEC9C13C9C,classic_bike,2021-03-05 10:53:14,2021-03-05 11:16:26,Hegewisch Metra Station,20215,Burnham Greenway & 105th St,20222,41.648501,-87.546090,41.704575,-87.528232,member
334667,334667,0744ADDD14D52471,classic_bike,2021-03-10 11:01:20,2021-03-10 11:23:47,Hegewisch Metra Station,20215,Burnham Greenway & 105th St,20222,41.648501,-87.546090,41.704575,-87.528232,member
364275,364275,4640F42D34E739E9,classic_bike,2021-03-13 16:12:25,2021-03-13 16:46:01,Hegewisch Metra Station,20215,Ewing Ave & Burnham Greenway,20223,41.648501,-87.546090,41.712749,-87.534814,member
368683,368683,218643C9083A6E9D,classic_bike,2021-03-28 12:40:45,2021-03-28 12:49:16,Hegewisch Metra Station,20215,Hegewisch Metra Station,20215,41.648501,-87.546090,41.648501,-87.546090,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4567438,4567438,28B71034C924011A,classic_bike,2021-10-11 13:45:16,2021-10-11 14:32:31,Hegewisch Metra Station,20215,Cottage Grove Ave & 111th Pl,20131,41.648501,-87.546090,41.691710,-87.610002,casual
4994644,4994644,4F6FB68D0D4CD300,electric_bike,2021-11-03 01:28:18,2021-11-03 01:33:32,Hegewisch Metra Station,20215,Commercial Ave & 130th St,20213,41.648591,-87.546205,41.659156,-87.550754,casual
4994661,4994661,8D392BC915B708F2,classic_bike,2021-11-04 15:07:43,2021-11-04 15:16:59,Hegewisch Metra Station,20215,Commercial Ave & 130th St,20213,41.648501,-87.546090,41.659150,-87.550762,casual
4994697,4994697,AD090077C2C9D53B,electric_bike,2021-11-09 21:52:42,2021-11-09 21:58:15,Hegewisch Metra Station,20215,Commercial Ave & 130th St,20213,41.648562,-87.546206,41.659135,-87.550712,casual


In [42]:
df2.loc[df2['from_station_id'] == '20215', 'from_station_name'] = 'Hegewisch Metra Station'

In [43]:
df2.isnull().sum()

Unnamed: 0           0
ride_id              0
rideable_type        0
start_time           0
end_time             0
from_station_name    0
from_station_id      0
to_station_name      0
end_station_id       0
start_lat            0
start_lng            0
end_lat              0
end_lng              0
usertype             0
dtype: int64

In [44]:
df2.shape

(4588303, 14)

In [45]:
df2.drop_duplicates()
df2.drop_duplicates(subset=['ride_id'])
df2.shape

(4588303, 14)

In [46]:
df2 = df2.drop('Unnamed: 0',axis=1)

In [47]:
df2

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
9,B9F73448DFBE0D45,classic_bike,2021-01-24 19:15:38,2021-01-24 19:22:51,California Ave & Cortez St,17660,Wood St & Augusta Blvd,657,41.900363,-87.696704,41.899181,-87.672200,member
10,457C7F4B5D3DA135,electric_bike,2021-01-23 12:57:38,2021-01-23 13:02:10,California Ave & Cortez St,17660,California Ave & North Ave,13258,41.900406,-87.696733,41.910435,-87.696890,member
11,57C750326F9FDABE,electric_bike,2021-01-09 15:28:04,2021-01-09 15:37:51,California Ave & Cortez St,17660,Wood St & Augusta Blvd,657,41.900374,-87.696688,41.899180,-87.672178,casual
12,4D518C65E338D070,electric_bike,2021-01-09 15:28:57,2021-01-09 15:37:54,California Ave & Cortez St,17660,Wood St & Augusta Blvd,657,41.900379,-87.696716,41.899149,-87.672177,casual
13,9D08A3AFF410474D,classic_bike,2021-01-24 15:56:59,2021-01-24 16:07:08,California Ave & Cortez St,17660,Wood St & Augusta Blvd,657,41.900363,-87.696704,41.899181,-87.672200,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595051,E06135F650553F21,electric_bike,2021-12-07 15:55:37,2021-12-07 16:00:17,Canal St & Madison St,13341,Desplaines St & Kinzie St,TA1306000003,41.881372,-87.640042,41.888456,-87.644336,casual
5595055,8DF0DBB049906332,electric_bike,2021-12-01 16:50:52,2021-12-01 16:55:18,Canal St & Madison St,13341,Desplaines St & Kinzie St,TA1306000003,41.881999,-87.639265,41.888415,-87.644342,casual
5595059,CF407BBC3B9FAD63,electric_bike,2021-12-06 19:37:50,2021-12-06 19:44:51,Canal St & Madison St,13341,Kingsbury St & Kinzie St,KA1503000043,41.882123,-87.640053,41.889106,-87.638862,member
5595060,60BB69EBF5440E92,electric_bike,2021-12-02 08:57:04,2021-12-02 09:05:21,Canal St & Madison St,13341,Dearborn St & Monroe St,TA1305000006,41.881956,-87.639955,41.880254,-87.629603,member


# 2022 Seperate

In [48]:
# Now to rinse and repeat 2022 dataset.

df3 = pd.read_csv('Divvy 2022 merged nostation.csv')
df3

Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
0,0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012800,-87.665906,42.012560,-87.674367,casual
1,1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.012560,-87.674367,casual
2,2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.925330,-87.665800,member
3,3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual
4,4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.877850,-87.624080,41.884621,-87.627834,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5148171,5148171,BC3BFA659C9AB6F1,classic_bike,2022-10-30 01:41:29,2022-10-30 01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
5148172,5148172,ACD65450291CF95F,classic_bike,2022-10-30 01:41:54,2022-10-30 01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
5148173,5148173,4AAC03D1438E97CA,classic_bike,2022-10-15 09:34:11,2022-10-15 10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,41.891466,-87.626761,casual
5148174,5148174,8E6F3F29785E5D40,classic_bike,2022-10-09 10:21:34,2022-10-09 10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,41.931931,-87.677856,member


In [49]:
df3.isnull().sum()

Unnamed: 0                0
ride_id                   0
rideable_type             0
start_time                0
end_time                  0
from_station_name    751824
from_station_id      751824
to_station_name      807325
end_station_id       807325
start_lat                 0
start_lng                 0
end_lat                5500
end_lng                5500
usertype                  0
dtype: int64

In [50]:
set(df3[df3['end_lat'].isnull()].index == df3[df3['end_lng'].isnull()].index)

{True}

In [51]:
df3.drop(df3[df3['end_lat'].isnull()].index, inplace=True)

In [52]:
df3 = df3.drop('Unnamed: 0', axis=1)

In [53]:
df3.isnull().sum()

ride_id                   0
rideable_type             0
start_time                0
end_time                  0
from_station_name    751824
from_station_id      751824
to_station_name      801825
end_station_id       801825
start_lat                 0
start_lng                 0
end_lat                   0
end_lng                   0
usertype                  0
dtype: int64

In [54]:
df3 = df3.dropna(subset=['from_station_name', 'from_station_id'], thresh=1)

# Will do the same for to_station_name and end_station_id
df3 = df3.dropna(subset=['to_station_name', 'end_station_id'], thresh=1)

In [55]:
df3.isnull().sum()

ride_id              0
rideable_type        0
start_time           0
end_time             0
from_station_name    0
from_station_id      0
to_station_name      0
end_station_id       0
start_lat            0
start_lng            0
end_lat              0
end_lng              0
usertype             0
dtype: int64

In [56]:
df3.shape

(3978163, 13)

In [57]:
df3.drop_duplicates()
df3.drop_duplicates(subset=['ride_id'])
df3.shape

(3978163, 13)

# Combining Dataframes

In [58]:
merged2020_2022 = pd.concat([df1, df2, df3], ignore_index=True)

In [59]:
merged2020_2022 = merged2020_2022.drop('index', axis=1)

In [60]:
merged2020_2022.drop_duplicates()
merged2020_2022.drop_duplicates(subset=['ride_id'])
merged2020_2022

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member
1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member
2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member
3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member
4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11956505,BC3BFA659C9AB6F1,classic_bike,2022-10-30 01:41:29,2022-10-30 01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
11956506,ACD65450291CF95F,classic_bike,2022-10-30 01:41:54,2022-10-30 01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
11956507,4AAC03D1438E97CA,classic_bike,2022-10-15 09:34:11,2022-10-15 10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,41.891466,-87.626761,casual
11956508,8E6F3F29785E5D40,classic_bike,2022-10-09 10:21:34,2022-10-09 10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,41.931931,-87.677856,member


In [61]:
merged2020_2022[merged2020_2022['from_station_name'] == 'DIVVY CASSETTE REPAIR MOBILE STATION']

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
5131038,04B18873381AACA9,classic_bike,2021-07-07 11:01:33,2021-07-07 11:14:59,DIVVY CASSETTE REPAIR MOBILE STATION,DIVVY CASSETTE REPAIR MOBILE STATION,Field Museum,13029,41.880958,-87.616743,41.865312,-87.617867,casual
5217433,A4354906AD557A6C,electric_bike,2021-07-07 11:06:50,2021-07-07 13:06:50,DIVVY CASSETTE REPAIR MOBILE STATION,DIVVY CASSETTE REPAIR MOBILE STATION,Lake Shore Dr & Monroe St,13300,41.881039,-87.61673,41.881225,-87.61675,casual
5510826,74B6C966F82A9FD5,electric_bike,2021-07-07 11:06:19,2021-07-07 13:06:49,DIVVY CASSETTE REPAIR MOBILE STATION,DIVVY CASSETTE REPAIR MOBILE STATION,Lake Shore Dr & Monroe St,13300,41.881051,-87.616766,41.881165,-87.616763,casual
5510955,C0D3A64896642496,classic_bike,2021-07-07 11:03:00,2021-07-07 13:06:22,DIVVY CASSETTE REPAIR MOBILE STATION,DIVVY CASSETTE REPAIR MOBILE STATION,Lake Shore Dr & Monroe St,13300,41.880958,-87.616743,41.880958,-87.616743,casual


In [62]:
# Remember, we don't want test values to obscure our actualy data. This means that all values with 'test' in the station names
# need to be found and deleted.

merged2020_2022[merged2020_2022['from_station_name'].str.lower().str.contains("test", regex=False, na=False) == True]

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
77901,7A46E4BD3F074400,docked_bike,2020-01-30 16:59:14,2020-01-30 17:26:42,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,Lincoln Ave & Belle Plaine Ave,298.0,41.890000,-87.680700,41.956000,-87.680300,member
148557,B869D5896CB1BE5F,docked_bike,2020-02-21 17:09:01,2020-02-21 17:35:25,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,Oakley Ave & Irving Park Rd,486.0,41.890000,-87.680700,41.954300,-87.686100,member
371556,A4E016418635549A,docked_bike,2020-03-31 17:15:48,2020-03-31 17:45:19,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,Oakley Ave & Irving Park Rd,486.0,41.890000,-87.680700,41.954300,-87.686100,member
372682,73D0B881C63D1736,docked_bike,2020-03-26 17:30:02,2020-03-26 17:53:42,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,Lincoln Ave & Waveland Ave,257.0,41.890000,-87.680700,41.948800,-87.675300,member
436240,01FD073BD38D7552,docked_bike,2020-04-24 17:35:34,2020-04-24 17:35:39,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,41.890000,-87.680700,41.890000,-87.680700,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3121165,35708F6806F43F9C,electric_bike,2020-11-09 11:39:26,2020-11-09 11:43:32,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,Wood St & Chicago Ave,637.0,41.889949,-87.680233,41.895539,-87.672092,member
3264119,7B23D12183D82FE1,electric_bike,2020-11-15 14:08:06,2020-11-15 14:11:04,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,Artesian Ave & Hubbard St,376.0,41.889961,-87.680281,41.889528,-87.688076,member
3264122,AB50D5777D630144,electric_bike,2020-11-08 14:02:30,2020-11-08 14:05:46,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),671.0,Artesian Ave & Hubbard St,376.0,41.889984,-87.680237,41.889509,-87.688109,member
3295010,FED70F7BEAEF5E70,electric_bike,2020-12-04 12:20:23,2020-12-04 12:22:33,HUBBARD ST BIKE CHECKING (LBS-WH-TEST),Hubbard Bike-checking (LBS-WH-TEST),HUBBARD ST BIKE CHECKING (LBS-WH-TEST),Hubbard Bike-checking (LBS-WH-TEST),41.889935,-87.680188,41.889949,-87.680265,member


In [63]:
merged2020_2022.shape

(11956510, 13)

In [64]:
merged2020_2022.drop(merged2020_2022[merged2020_2022['from_station_name'].str.lower().str.contains("test", regex=False, na=False) == True].index)

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member
1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member
2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member
3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member
4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11956505,BC3BFA659C9AB6F1,classic_bike,2022-10-30 01:41:29,2022-10-30 01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
11956506,ACD65450291CF95F,classic_bike,2022-10-30 01:41:54,2022-10-30 01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
11956507,4AAC03D1438E97CA,classic_bike,2022-10-15 09:34:11,2022-10-15 10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,41.891466,-87.626761,casual
11956508,8E6F3F29785E5D40,classic_bike,2022-10-09 10:21:34,2022-10-09 10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,41.931931,-87.677856,member


In [65]:
# so we can delete all rows with occurances of the phrase
merged2020_2022 = merged2020_2022.drop(merged2020_2022[merged2020_2022['from_station_name'].str.lower().str.contains("test", regex=False, na=False) == True].index)
# now repeat for to_station_names
merged2020_2022 = merged2020_2022.drop(merged2020_2022[merged2020_2022['to_station_name'].str.lower().str.contains("test", regex=False, na=False) == True].index)

In [66]:
merged2020_2022.isnull().sum()

ride_id              0
rideable_type        0
start_time           0
end_time             0
from_station_name    0
from_station_id      0
to_station_name      0
end_station_id       0
start_lat            0
start_lng            0
end_lat              0
end_lng              0
usertype             0
dtype: int64

In [67]:
# I will need to remain concious of this going forward, when I do the final data cleaning on all of the data at once. I will
# make sure to do a comprehensive over-view of all of the names that are assigned to station id's in the final dataset, to
# correct inconsistencies as well as delete any more of these types of entries that involve company testing/facilities

In [68]:
merged2020_2022 = merged2020_2022.reset_index()

In [69]:
merged2020_2022

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype
0,0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member
1,1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member
2,2,789F3C21E472CA96,docked_bike,2020-01-09 19:29:26,2020-01-09 19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member
3,3,C9A388DAC6ABF313,docked_bike,2020-01-06 16:17:07,2020-01-06 16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member
4,4,943BC3CBECCFD662,docked_bike,2020-01-30 08:37:16,2020-01-30 08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,11956505,BC3BFA659C9AB6F1,classic_bike,2022-10-30 01:41:29,2022-10-30 01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
11952926,11956506,ACD65450291CF95F,classic_bike,2022-10-30 01:41:54,2022-10-30 01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual
11952927,11956507,4AAC03D1438E97CA,classic_bike,2022-10-15 09:34:11,2022-10-15 10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,41.891466,-87.626761,casual
11952928,11956508,8E6F3F29785E5D40,classic_bike,2022-10-09 10:21:34,2022-10-09 10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,41.931931,-87.677856,member


In [70]:
# Now to split time columns

merged2020_2022[['start_date', 'start_time']] = merged2020_2022.start_time.str.split(" ", expand=True)
merged2020_2022[['end_date', 'end_time']] = merged2020_2022.end_time.str.split(" ", expand=True)
merged2020_2022

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype,start_date,end_date
0,0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member,2020-01-21,2020-01-21
1,1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member,2020-01-30,2020-01-30
2,2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member,2020-01-09,2020-01-09
3,3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member,2020-01-06,2020-01-06
4,4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member,2020-01-30,2020-01-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,11956505,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual,2022-10-30,2022-10-30
11952926,11956506,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual,2022-10-30,2022-10-30
11952927,11956507,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,41.891466,-87.626761,casual,2022-10-15,2022-10-15
11952928,11956508,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,41.931931,-87.677856,member,2022-10-09,2022-10-09


In [71]:
merged2020_2022 = merged2020_2022.drop('index', axis=1)

In [72]:
merged2020_2022

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype,start_date,end_date
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member,2020-01-21,2020-01-21
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member,2020-01-30,2020-01-30
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member,2020-01-09,2020-01-09
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member,2020-01-06,2020-01-06
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member,2020-01-30,2020-01-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual,2022-10-30,2022-10-30
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual,2022-10-30,2022-10-30
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,41.891466,-87.626761,casual,2022-10-15,2022-10-15
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,41.931931,-87.677856,member,2022-10-09,2022-10-09


# Stations merge for landmark and dpcapcity data

In [73]:
# I am now going to merge this dataset with the 'stations_master_list' data, because we can add to/from landmark columns, as
# well as to/from city columns.
stations = pd.read_csv('master_station_list.csv')
stations = stations.drop(['Unnamed: 0'], axis=1)

In [74]:
stations

Unnamed: 0,id,name,latitude,longitude,dpcapacity,landmark,city
0,5,State St & Harrison St,41.873958,-87.627739,19,30.0,Chicago
1,13,Wilton Ave & Diversey Pkwy,41.932500,-87.652681,19,66.0,Chicago
2,14,Morgan St & 18th St,41.858086,-87.651073,15,163.0,Chicago
3,15,Racine Ave & 18th St,41.858181,-87.656487,15,164.0,Chicago
4,16,Wood St & North Ave,41.910329,-87.672516,15,223.0,Chicago
...,...,...,...,...,...,...,...
581,622,California Ave & Cortez St,41.900363,-87.696704,0,0.0,Chicago
582,623,Michigan Ave & 8th St,41.872773,-87.623981,23,0.0,Chicago
583,624,Dearborn St & Van Buren St (*),41.876268,-87.629155,16,0.0,Chicago
584,625,Chicago Ave & Dempster St,42.041691,-87.680687,15,0.0,Evanston


In [75]:
stations.dtypes

id              int64
name           object
latitude      float64
longitude     float64
dpcapacity      int64
landmark      float64
city           object
dtype: object

In [76]:
stations = stations.astype({'id' : 'float'})
stations = stations.astype({'id' : 'string'})

In [77]:
stations

Unnamed: 0,id,name,latitude,longitude,dpcapacity,landmark,city
0,5.0,State St & Harrison St,41.873958,-87.627739,19,30.0,Chicago
1,13.0,Wilton Ave & Diversey Pkwy,41.932500,-87.652681,19,66.0,Chicago
2,14.0,Morgan St & 18th St,41.858086,-87.651073,15,163.0,Chicago
3,15.0,Racine Ave & 18th St,41.858181,-87.656487,15,164.0,Chicago
4,16.0,Wood St & North Ave,41.910329,-87.672516,15,223.0,Chicago
...,...,...,...,...,...,...,...
581,622.0,California Ave & Cortez St,41.900363,-87.696704,0,0.0,Chicago
582,623.0,Michigan Ave & 8th St,41.872773,-87.623981,23,0.0,Chicago
583,624.0,Dearborn St & Van Buren St (*),41.876268,-87.629155,16,0.0,Chicago
584,625.0,Chicago Ave & Dempster St,42.041691,-87.680687,15,0.0,Evanston


In [78]:
# I will drop latitude and longitude, since we already have these. I will include dpcapcity, but only for now. Merging on 'id'

In [79]:
stations = stations.drop(['latitude', 'longitude', 'name'], axis=1)

# 'from_city','from_landmark',
from_city = stations.rename(columns={'id' : 'from_station_id', 'city' : 'from_city', 'landmark' : 'from_landmark', 'dpcapacity' : 'from_dpcapacity'})

# 'to_city', 'to_landmark'
to_city = stations.rename(columns={'id' : 'end_station_id', 'city' : 'to_city', 'landmark' : 'to_landmark', 'dpcapacity' : 'to_dpcapacity'})

In [80]:
from_city

Unnamed: 0,from_station_id,from_dpcapacity,from_landmark,from_city
0,5.0,19,30.0,Chicago
1,13.0,19,66.0,Chicago
2,14.0,15,163.0,Chicago
3,15.0,15,164.0,Chicago
4,16.0,15,223.0,Chicago
...,...,...,...,...
581,622.0,0,0.0,Chicago
582,623.0,23,0.0,Chicago
583,624.0,16,0.0,Chicago
584,625.0,15,0.0,Evanston


In [81]:
to_city

Unnamed: 0,end_station_id,to_dpcapacity,to_landmark,to_city
0,5.0,19,30.0,Chicago
1,13.0,19,66.0,Chicago
2,14.0,15,163.0,Chicago
3,15.0,15,164.0,Chicago
4,16.0,15,223.0,Chicago
...,...,...,...,...
581,622.0,0,0.0,Chicago
582,623.0,23,0.0,Chicago
583,624.0,16,0.0,Chicago
584,625.0,15,0.0,Evanston


In [82]:
to_city

Unnamed: 0,end_station_id,to_dpcapacity,to_landmark,to_city
0,5.0,19,30.0,Chicago
1,13.0,19,66.0,Chicago
2,14.0,15,163.0,Chicago
3,15.0,15,164.0,Chicago
4,16.0,15,223.0,Chicago
...,...,...,...,...
581,622.0,0,0.0,Chicago
582,623.0,23,0.0,Chicago
583,624.0,16,0.0,Chicago
584,625.0,15,0.0,Evanston


In [83]:
k = pd.merge(merged2020_2022, from_city, on='from_station_id', how='left')

In [84]:
k

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,41.967100,-87.667400,member,2020-01-21,2020-01-21,,,
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,41.954200,-87.664400,member,2020-01-30,2020-01-30,,,
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,41.940200,-87.653000,member,2020-01-09,2020-01-09,,,
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,41.891800,-87.620600,member,2020-01-06,2020-01-06,,,
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,41.889900,-87.634300,member,2020-01-30,2020-01-30,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual,2022-10-30,2022-10-30,,,
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,41.943350,-87.670668,casual,2022-10-30,2022-10-30,,,
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,41.891466,-87.626761,casual,2022-10-15,2022-10-15,,,
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,41.931931,-87.677856,member,2022-10-09,2022-10-09,,,


In [85]:
k_final = pd.merge(k, to_city, on='end_station_id', how='left')

In [86]:
k_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,end_lng,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,-87.667400,member,2020-01-21,2020-01-21,,,,,,
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,-87.664400,member,2020-01-30,2020-01-30,,,,,,
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,-87.653000,member,2020-01-09,2020-01-09,,,,,,
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,-87.620600,member,2020-01-06,2020-01-06,,,,,,
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,-87.634300,member,2020-01-30,2020-01-30,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,,,,,
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,,,,,
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,-87.626761,casual,2022-10-15,2022-10-15,,,,,,
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,-87.677856,member,2022-10-09,2022-10-09,,,,,,


In [87]:
k_final.isnull().sum()

ride_id                     0
rideable_type               0
start_time                  0
end_time                    0
from_station_name           0
from_station_id             0
to_station_name             0
end_station_id              0
start_lat                   0
start_lng                   0
end_lat                     0
end_lng                     0
usertype                    0
start_date                  0
end_date                    0
from_dpcapacity      11946646
from_landmark        11946646
from_city            11946646
to_dpcapacity        11946875
to_landmark          11946875
to_city              11946875
dtype: int64

In [88]:
stations

Unnamed: 0,id,dpcapacity,landmark,city
0,5.0,19,30.0,Chicago
1,13.0,19,66.0,Chicago
2,14.0,15,163.0,Chicago
3,15.0,15,164.0,Chicago
4,16.0,15,223.0,Chicago
...,...,...,...,...
581,622.0,0,0.0,Chicago
582,623.0,23,0.0,Chicago
583,624.0,16,0.0,Chicago
584,625.0,15,0.0,Evanston


In [89]:
stations_and_landmarks = {}
ids = stations['id'].values
landmarks = stations['landmark'].values

for i in range(len(ids)):
    stations_and_landmarks[f'{ids[i]}'] = landmarks[i]
stations_and_landmarks

{'5.0': 30.0,
 '13.0': 66.0,
 '14.0': 163.0,
 '15.0': 164.0,
 '16.0': 223.0,
 '17.0': 246.0,
 '19.0': 139.0,
 '20.0': 154.0,
 '21.0': 157.0,
 '22.0': 160.0,
 '23.0': 172.0,
 '24.0': 262.0,
 '25.0': 34.0,
 '26.0': 51.0,
 '27.0': 174.0,
 '28.0': 282.0,
 '29.0': 290.0,
 '30.0': 248.0,
 '31.0': 17.0,
 '32.0': 76.0,
 '33.0': 3.0,
 '34.0': 124.0,
 '35.0': 22.0,
 '36.0': 19.0,
 '37.0': 20.0,
 '42.0': 170.0,
 '43.0': 1.0,
 '44.0': 2.0,
 '45.0': 40.0,
 '46.0': 46.0,
 '47.0': 50.0,
 '48.0': 12.0,
 '49.0': 49.0,
 '50.0': 9.0,
 '51.0': 5.0,
 '52.0': 43.0,
 '53.0': 64.0,
 '54.0': 44.0,
 '55.0': 166.0,
 '56.0': 39.0,
 '57.0': 327.0,
 '58.0': 25.0,
 '59.0': 31.0,
 '60.0': 58.0,
 '61.0': 221.0,
 '62.0': 106.0,
 '66.0': 21.0,
 '67.0': 60.0,
 '68.0': 37.0,
 '69.0': 65.0,
 '71.0': 72.0,
 '72.0': 148.0,
 '73.0': 38.0,
 '74.0': 265.0,
 '75.0': 138.0,
 '76.0': 300.0,
 '77.0': 301.0,
 '80.0': 156.0,
 '81.0': 23.0,
 '84.0': 33.0,
 '85.0': 42.0,
 '86.0': 289.0,
 '87.0': 189.0,
 '88.0': 155.0,
 '90.0': 8.0,
 '9

In [90]:
new_column_from_station = []
run_ids_from = k_final['from_station_id'].values


for i in range(len(run_ids_from)):
    try:
        new_column_from_station.append(stations_and_landmarks[f'{run_ids_from[i]}'])
    except KeyError:
        new_column_from_station.append(0.0)
        pass
    
    
    
new_column_from_station   

[112.0,
 316.0,
 277.0,
 5.0,
 21.0,
 264.0,
 535.0,
 535.0,
 264.0,
 547.0,
 69.0,
 430.0,
 10.0,
 10.0,
 84.0,
 19.0,
 66.0,
 19.0,
 226.0,
 532.0,
 300.0,
 11.0,
 309.0,
 323.0,
 539.0,
 11.0,
 241.0,
 19.0,
 45.0,
 19.0,
 208.0,
 66.0,
 0.0,
 145.0,
 243.0,
 353.0,
 186.0,
 171.0,
 12.0,
 33.0,
 63.0,
 39.0,
 21.0,
 197.0,
 196.0,
 78.0,
 141.0,
 310.0,
 310.0,
 622.0,
 141.0,
 141.0,
 374.0,
 53.0,
 353.0,
 277.0,
 353.0,
 23.0,
 341.0,
 57.0,
 263.0,
 339.0,
 263.0,
 145.0,
 145.0,
 4.0,
 194.0,
 22.0,
 263.0,
 145.0,
 263.0,
 263.0,
 4.0,
 263.0,
 61.0,
 241.0,
 21.0,
 408.0,
 456.0,
 408.0,
 1.0,
 456.0,
 456.0,
 534.0,
 148.0,
 263.0,
 263.0,
 232.0,
 112.0,
 85.0,
 643.0,
 6.0,
 275.0,
 26.0,
 85.0,
 643.0,
 140.0,
 66.0,
 524.0,
 211.0,
 1.0,
 10.0,
 211.0,
 10.0,
 210.0,
 210.0,
 137.0,
 527.0,
 210.0,
 52.0,
 475.0,
 480.0,
 475.0,
 1.0,
 448.0,
 1.0,
 43.0,
 5.0,
 0.0,
 168.0,
 681.0,
 168.0,
 0.0,
 354.0,
 0.0,
 385.0,
 72.0,
 409.0,
 345.0,
 448.0,
 420.0,
 83.0,
 266.0

In [91]:
len(new_column_from_station)

11952930

In [92]:
k_final.shape

(11952930, 21)

In [93]:
k_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,end_lng,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,-87.667400,member,2020-01-21,2020-01-21,,,,,,
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,-87.664400,member,2020-01-30,2020-01-30,,,,,,
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,-87.653000,member,2020-01-09,2020-01-09,,,,,,
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,-87.620600,member,2020-01-06,2020-01-06,,,,,,
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,-87.634300,member,2020-01-30,2020-01-30,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,,,,,
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,,,,,
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,-87.626761,casual,2022-10-15,2022-10-15,,,,,,
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,-87.677856,member,2022-10-09,2022-10-09,,,,,,


In [94]:
k_final['from_landmark'] = new_column_from_station

In [95]:
k_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,end_lng,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,-87.667400,member,2020-01-21,2020-01-21,,112.0,,,,
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,-87.664400,member,2020-01-30,2020-01-30,,316.0,,,,
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,-87.653000,member,2020-01-09,2020-01-09,,277.0,,,,
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,-87.620600,member,2020-01-06,2020-01-06,,5.0,,,,
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,-87.634300,member,2020-01-30,2020-01-30,,21.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,0.0,,,,
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,0.0,,,,
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,-87.626761,casual,2022-10-15,2022-10-15,,0.0,,,,
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,-87.677856,member,2022-10-09,2022-10-09,,0.0,,,,


In [96]:
new_column_to_station = []
run_ids_to = k_final['end_station_id'].values


for i in range(len(run_ids_to)):
    try:
        new_column_to_station.append(stations_and_landmarks[f'{run_ids_to[i]}'])
    except KeyError:
        new_column_to_station.append(0.0)
        pass
new_column_to_station   

[239.0,
 321.0,
 69.0,
 262.0,
 264.0,
 535.0,
 264.0,
 264.0,
 535.0,
 13.0,
 0.0,
 10.0,
 430.0,
 430.0,
 25.0,
 226.0,
 19.0,
 226.0,
 19.0,
 303.0,
 1.0,
 1.0,
 11.0,
 63.0,
 1.0,
 539.0,
 432.0,
 344.0,
 7.0,
 159.0,
 0.0,
 210.0,
 66.0,
 276.0,
 142.0,
 53.0,
 12.0,
 186.0,
 186.0,
 21.0,
 321.0,
 196.0,
 196.0,
 196.0,
 21.0,
 273.0,
 623.0,
 534.0,
 534.0,
 642.0,
 623.0,
 353.0,
 273.0,
 353.0,
 53.0,
 123.0,
 23.0,
 353.0,
 36.0,
 263.0,
 4.0,
 263.0,
 4.0,
 20.0,
 4.0,
 145.0,
 524.0,
 534.0,
 4.0,
 78.0,
 4.0,
 4.0,
 43.0,
 4.0,
 241.0,
 6.0,
 531.0,
 456.0,
 408.0,
 456.0,
 534.0,
 408.0,
 408.0,
 2.0,
 456.0,
 4.0,
 4.0,
 67.0,
 616.0,
 643.0,
 85.0,
 64.0,
 232.0,
 275.0,
 643.0,
 85.0,
 211.0,
 532.0,
 275.0,
 52.0,
 301.0,
 1.0,
 52.0,
 1.0,
 210.0,
 137.0,
 210.0,
 52.0,
 527.0,
 210.0,
 480.0,
 475.0,
 480.0,
 10.0,
 90.0,
 0.0,
 0.0,
 0.0,
 547.0,
 0.0,
 168.0,
 0.0,
 354.0,
 0.0,
 4.0,
 384.0,
 0.0,
 11.0,
 448.0,
 420.0,
 448.0,
 448.0,
 0.0,
 266.0,
 229.0,
 154.

In [97]:
k_final['to_landmark'] = new_column_to_station

In [98]:
k_final.isnull().sum()

ride_id                     0
rideable_type               0
start_time                  0
end_time                    0
from_station_name           0
from_station_id             0
to_station_name             0
end_station_id              0
start_lat                   0
start_lng                   0
end_lat                     0
end_lng                     0
usertype                    0
start_date                  0
end_date                    0
from_dpcapacity      11946646
from_landmark               0
from_city            11946646
to_dpcapacity        11946875
to_landmark                 0
to_city              11946875
dtype: int64

In [99]:
landmarks_and_city = {}
city = stations['city'].values
landmarks = stations['landmark'].values

for i in range(len(ids)):
    landmarks_and_city[f'{landmarks[i]}'] = city[i]
landmarks_and_city

{'30.0': 'Chicago',
 '66.0': 'Chicago',
 '163.0': 'Chicago',
 '164.0': 'Chicago',
 '223.0': 'Chicago',
 '246.0': 'Chicago',
 '139.0': 'Chicago',
 '154.0': 'Chicago',
 '157.0': 'Chicago',
 '160.0': 'Chicago',
 '172.0': 'Chicago',
 '262.0': 'Chicago',
 '34.0': 'Chicago',
 '51.0': 'Chicago',
 '174.0': 'Chicago',
 '282.0': 'Chicago',
 '290.0': 'Chicago',
 '248.0': 'Chicago',
 '17.0': 'Chicago',
 '76.0': 'Chicago',
 '3.0': 'Chicago',
 '124.0': 'Chicago',
 '22.0': 'Chicago',
 '19.0': 'Chicago',
 '20.0': 'Chicago',
 '170.0': 'Chicago',
 '1.0': 'Chicago',
 '2.0': 'Chicago',
 '40.0': 'Chicago',
 '46.0': 'Chicago',
 '50.0': 'Chicago',
 '12.0': 'Chicago',
 '49.0': 'Chicago',
 '9.0': 'Chicago',
 '5.0': 'Chicago',
 '43.0': 'Chicago',
 '64.0': 'Chicago',
 '44.0': 'Chicago',
 '166.0': 'Chicago',
 '39.0': 'Chicago',
 '327.0': 'Chicago',
 '25.0': 'Chicago',
 '31.0': 'Chicago',
 '58.0': 'Chicago',
 '221.0': 'Chicago',
 '106.0': 'Chicago',
 '21.0': 'Chicago',
 '60.0': 'Chicago',
 '37.0': 'Chicago',
 '65.

In [100]:
new_column_from_city = []
run_ids_from2 = k_final['from_landmark'].values


for i in range(len(run_ids_from2)):
    try:
        new_column_from_city.append(landmarks_and_city[f'{run_ids_from2[i]}'])
    except KeyError:
        new_column_from_city.append(0.0)
        pass
new_column_from_city

['Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Ch

In [101]:
set(new_column_from_city)

{'Chicago'}

In [102]:
k_final['from_city'] = new_column_from_city

In [103]:
new_column_to_city = []
run_ids_to3 = k_final['to_landmark'].values


for i in range(len(run_ids_to3)):
    try:
        new_column_to_city.append(landmarks_and_city[f'{run_ids_to3[i]}'])
    except KeyError:
        new_column_to_city.append(0.0)
        pass
new_column_to_city

['Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Chicago',
 'Ch

In [104]:
set(new_column_to_city)

{'Chicago'}

In [105]:
k_final['to_city'] = new_column_to_city

In [106]:
k_final.isnull().sum()

ride_id                     0
rideable_type               0
start_time                  0
end_time                    0
from_station_name           0
from_station_id             0
to_station_name             0
end_station_id              0
start_lat                   0
start_lng                   0
end_lat                     0
end_lng                     0
usertype                    0
start_date                  0
end_date                    0
from_dpcapacity      11946646
from_landmark               0
from_city                   0
to_dpcapacity        11946875
to_landmark                 0
to_city                     0
dtype: int64

In [107]:
k_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,end_lng,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,-87.667400,member,2020-01-21,2020-01-21,,112.0,Chicago,,239.0,Chicago
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,-87.664400,member,2020-01-30,2020-01-30,,316.0,Chicago,,321.0,Chicago
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,-87.653000,member,2020-01-09,2020-01-09,,277.0,Chicago,,69.0,Chicago
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,-87.620600,member,2020-01-06,2020-01-06,,5.0,Chicago,,262.0,Chicago
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,-87.634300,member,2020-01-30,2020-01-30,,21.0,Chicago,,264.0,Chicago
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,0.0,Chicago,,0.0,Chicago
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,,0.0,Chicago,,0.0,Chicago
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,-87.626761,casual,2022-10-15,2022-10-15,,0.0,Chicago,,0.0,Chicago
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,-87.677856,member,2022-10-09,2022-10-09,,0.0,Chicago,,0.0,Chicago


In [108]:
k_final.isnull().sum()

ride_id                     0
rideable_type               0
start_time                  0
end_time                    0
from_station_name           0
from_station_id             0
to_station_name             0
end_station_id              0
start_lat                   0
start_lng                   0
end_lat                     0
end_lng                     0
usertype                    0
start_date                  0
end_date                    0
from_dpcapacity      11946646
from_landmark               0
from_city                   0
to_dpcapacity        11946875
to_landmark                 0
to_city                     0
dtype: int64

In [109]:
station_id_and_dpcapacity = {}
station_id = stations['id'].values
dpcapacity = stations['dpcapacity'].values

for i in range(len(ids)):
    station_id_and_dpcapacity[f'{station_id[i]}'] = dpcapacity[i]
station_id_and_dpcapacity

{'5.0': 19,
 '13.0': 19,
 '14.0': 15,
 '15.0': 15,
 '16.0': 15,
 '17.0': 15,
 '19.0': 15,
 '20.0': 15,
 '21.0': 15,
 '22.0': 15,
 '23.0': 15,
 '24.0': 15,
 '25.0': 23,
 '26.0': 23,
 '27.0': 19,
 '28.0': 15,
 '29.0': 15,
 '30.0': 15,
 '31.0': 23,
 '32.0': 19,
 '33.0': 27,
 '34.0': 15,
 '35.0': 35,
 '36.0': 27,
 '37.0': 19,
 '42.0': 15,
 '43.0': 43,
 '44.0': 27,
 '45.0': 15,
 '46.0': 19,
 '47.0': 15,
 '48.0': 27,
 '49.0': 23,
 '50.0': 27,
 '51.0': 31,
 '52.0': 23,
 '53.0': 19,
 '54.0': 19,
 '55.0': 15,
 '56.0': 19,
 '57.0': 15,
 '58.0': 15,
 '59.0': 19,
 '60.0': 19,
 '61.0': 15,
 '62.0': 15,
 '66.0': 19,
 '67.0': 15,
 '68.0': 23,
 '69.0': 19,
 '71.0': 15,
 '72.0': 15,
 '73.0': 19,
 '74.0': 23,
 '75.0': 35,
 '76.0': 11,
 '77.0': 23,
 '80.0': 19,
 '81.0': 47,
 '84.0': 19,
 '85.0': 15,
 '86.0': 15,
 '87.0': 19,
 '88.0': 15,
 '90.0': 35,
 '91.0': 31,
 '92.0': 19,
 '93.0': 15,
 '94.0': 19,
 '97.0': 19,
 '98.0': 15,
 '99.0': 19,
 '100.0': 23,
 '106.0': 27,
 '108.0': 19,
 '109.0': 19,
 '110.0':

In [110]:
new_column_dpcapacity_from = []
run_ids_to4 = k_final['from_station_id'].values


for i in range(len(run_ids_to4)):
    try:
        new_column_dpcapacity_from.append(station_id_and_dpcapacity[f'{run_ids_to4[i]}'])
    except KeyError:
        new_column_dpcapacity_from.append(0)
        pass
new_column_dpcapacity_from

[15,
 19,
 15,
 31,
 19,
 19,
 23,
 23,
 19,
 27,
 23,
 19,
 31,
 31,
 15,
 27,
 19,
 27,
 15,
 27,
 11,
 27,
 19,
 15,
 23,
 27,
 15,
 27,
 23,
 27,
 15,
 19,
 0,
 19,
 15,
 15,
 15,
 15,
 27,
 19,
 11,
 19,
 19,
 15,
 19,
 19,
 15,
 19,
 19,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 15,
 47,
 23,
 23,
 23,
 11,
 23,
 19,
 19,
 15,
 15,
 35,
 23,
 19,
 23,
 23,
 15,
 23,
 19,
 15,
 19,
 15,
 15,
 15,
 43,
 15,
 15,
 19,
 15,
 23,
 23,
 15,
 15,
 19,
 15,
 15,
 19,
 15,
 19,
 15,
 15,
 19,
 19,
 19,
 43,
 31,
 19,
 31,
 15,
 15,
 15,
 15,
 15,
 19,
 19,
 15,
 19,
 43,
 19,
 43,
 23,
 31,
 0,
 15,
 19,
 15,
 0,
 15,
 0,
 15,
 15,
 15,
 15,
 19,
 15,
 15,
 11,
 0,
 19,
 19,
 15,
 15,
 15,
 27,
 27,
 31,
 27,
 19,
 15,
 27,
 15,
 15,
 15,
 15,
 11,
 23,
 15,
 23,
 15,
 11,
 23,
 19,
 27,
 31,
 47,
 19,
 15,
 0,
 15,
 19,
 19,
 19,
 15,
 0,
 19,
 19,
 15,
 15,
 23,
 47,
 47,
 23,
 23,
 23,
 19,
 19,
 0,
 0,
 27,
 15,
 27,
 27,
 23,
 15,
 15,
 27,
 15,
 15,
 11,
 27,
 23,
 15,
 15,
 15,
 15,
 15,

In [111]:
stations

Unnamed: 0,id,dpcapacity,landmark,city
0,5.0,19,30.0,Chicago
1,13.0,19,66.0,Chicago
2,14.0,15,163.0,Chicago
3,15.0,15,164.0,Chicago
4,16.0,15,223.0,Chicago
...,...,...,...,...
581,622.0,0,0.0,Chicago
582,623.0,23,0.0,Chicago
583,624.0,16,0.0,Chicago
584,625.0,15,0.0,Evanston


In [112]:
# Checking we added actual values instead of just 0.0
max(new_column_dpcapacity_from)

47

In [113]:
k_final['from_dpcapacity'] = new_column_dpcapacity_from

In [114]:
k_final.isnull().sum()

ride_id                     0
rideable_type               0
start_time                  0
end_time                    0
from_station_name           0
from_station_id             0
to_station_name             0
end_station_id              0
start_lat                   0
start_lng                   0
end_lat                     0
end_lng                     0
usertype                    0
start_date                  0
end_date                    0
from_dpcapacity             0
from_landmark               0
from_city                   0
to_dpcapacity        11946875
to_landmark                 0
to_city                     0
dtype: int64

In [115]:
new_column_dpcapacity_to = []
run_ids_to5 = k_final['end_station_id'].values


for i in range(len(run_ids_to4)):
    try:
        new_column_dpcapacity_to.append(station_id_and_dpcapacity[f'{run_ids_to5[i]}'])
    except KeyError:
        new_column_dpcapacity_to.append(0)
        pass
new_column_dpcapacity_to

[11,
 15,
 23,
 15,
 19,
 23,
 19,
 19,
 23,
 23,
 0,
 31,
 19,
 19,
 15,
 15,
 27,
 15,
 27,
 19,
 43,
 43,
 27,
 11,
 43,
 23,
 15,
 19,
 15,
 19,
 0,
 15,
 19,
 15,
 15,
 15,
 27,
 15,
 15,
 19,
 15,
 19,
 19,
 19,
 19,
 15,
 15,
 19,
 19,
 19,
 15,
 15,
 15,
 15,
 15,
 19,
 47,
 15,
 19,
 23,
 15,
 23,
 15,
 19,
 15,
 19,
 19,
 19,
 15,
 19,
 15,
 15,
 23,
 15,
 15,
 15,
 19,
 15,
 15,
 15,
 19,
 15,
 15,
 27,
 15,
 15,
 15,
 23,
 15,
 15,
 19,
 19,
 15,
 19,
 15,
 19,
 19,
 27,
 19,
 19,
 23,
 43,
 19,
 43,
 15,
 15,
 15,
 19,
 15,
 15,
 15,
 19,
 15,
 31,
 11,
 0,
 0,
 0,
 27,
 0,
 15,
 0,
 15,
 0,
 15,
 15,
 0,
 27,
 19,
 15,
 19,
 19,
 0,
 11,
 15,
 15,
 15,
 15,
 15,
 19,
 31,
 27,
 23,
 19,
 19,
 15,
 19,
 19,
 19,
 15,
 15,
 31,
 15,
 23,
 31,
 15,
 27,
 15,
 23,
 27,
 27,
 15,
 19,
 23,
 19,
 15,
 19,
 23,
 19,
 15,
 15,
 19,
 15,
 19,
 47,
 23,
 23,
 47,
 47,
 47,
 31,
 0,
 19,
 19,
 19,
 27,
 31,
 19,
 27,
 15,
 23,
 15,
 15,
 19,
 11,
 19,
 19,
 15,
 23,
 23,
 23,
 15,
 

In [116]:
max(new_column_dpcapacity_to)

47

In [117]:
k_final['to_dpcapacity'] = new_column_dpcapacity_to

In [118]:
k_final.isnull().sum()

ride_id              0
rideable_type        0
start_time           0
end_time             0
from_station_name    0
from_station_id      0
to_station_name      0
end_station_id       0
start_lat            0
start_lng            0
end_lat              0
end_lng              0
usertype             0
start_date           0
end_date             0
from_dpcapacity      0
from_landmark        0
from_city            0
to_dpcapacity        0
to_landmark          0
to_city              0
dtype: int64

In [119]:
# making sure our values are different
new_column_dpcapacity_to == new_column_dpcapacity_from

False

In [120]:
k_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,end_lng,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,-87.667400,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,-87.664400,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,-87.653000,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,-87.620600,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,-87.634300,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,-87.626761,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,-87.677856,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago


In [121]:
# need to add a trip_duration, take into account overnight/ date changes when trying to figure out time difference between
# start of trip and end of trip, which means we will need to first build to handle cases where start/end date remains the same, 
# and then handle all cases where this isn't true

In [122]:
display(k_final)

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,end_lng,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,-87.667400,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,-87.664400,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,-87.653000,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,-87.620600,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,-87.634300,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,-87.670668,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,-87.626761,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,-87.677856,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago


In [123]:
start_times = k_final['start_time'].values

In [124]:
# figure out rules for a single start/end ride time

input_time = '20:06:59'
input_for_convert = input_time.split(':')
input_for_convert
seconds = (int(input_for_convert[0]) * 360) + (int(input_for_convert[1]) * 60) + (int(input_for_convert[2]))
seconds


input_time2 = '20:14:30'
input_for_convert2 = input_time2.split(':')
input_for_convert2
seconds2 = (int(input_for_convert2[0]) * 360) + (int(input_for_convert2[1]) * 60) + (int(input_for_convert2[2]))
seconds2

# if ride does not carry over a day
if seconds < seconds2:
    ride_duration = seconds2 - seconds
    
# if ride does carry over a day
if seconds > seconds2:
    limit = (23 * 360) + (59 * 60) + 59
    in_between = limit - sec1
    print(in_between + sec2)

In [125]:
end_times = k_final['end_time'].values

In [126]:
ride_durations = []

for i in range(len(start_times)):
    start_time = start_times[i].split(':')
    end_time = end_times[i].split(':')
    
    start_time = (int(start_time[0]) * 3600) + (int(start_time[1]) * 60) + (int(start_time[2]))
    end_time = (int(end_time[0]) * 3600) + (int(end_time[1]) * 60) + (int(end_time[2]))
    
    
    if start_time < end_time:
        ride_duration = end_time - start_time
    
    elif start_time > end_time:
        limit = (23 * 3600) + (59 * 60) + 59
        in_between = limit - start_time
        ride_duration = in_between + end_time

    elif start_time == end_time:
        ride_duration = 0
        
    ride_durations.append(ride_duration)
ride_durations

[451,
 223,
 171,
 529,
 332,
 289,
 289,
 297,
 295,
 203,
 196,
 380,
 529,
 465,
 1000,
 2147,
 1965,
 1927,
 2217,
 368,
 1801,
 658,
 1261,
 175,
 823,
 171,
 778,
 218,
 770,
 510,
 409,
 152,
 341,
 1795,
 482,
 324,
 666,
 586,
 686,
 270,
 276,
 295,
 574,
 479,
 424,
 913,
 2480,
 801,
 685,
 630,
 1967,
 650,
 374,
 397,
 318,
 441,
 1162,
 868,
 380,
 410,
 311,
 252,
 283,
 2088,
 1872,
 1831,
 992,
 291,
 340,
 666,
 258,
 287,
 162,
 266,
 490,
 1339,
 836,
 583,
 728,
 646,
 360,
 688,
 805,
 429,
 579,
 411,
 354,
 143,
 194,
 414,
 278,
 593,
 568,
 616,
 373,
 351,
 426,
 2222,
 1691,
 1300,
 642,
 532,
 1215,
 613,
 680,
 912,
 429,
 299,
 882,
 1033,
 491,
 431,
 613,
 506,
 868,
 783,
 897,
 498,
 509,
 1465,
 846,
 1038,
 943,
 627,
 301,
 405,
 533,
 410,
 226,
 361,
 439,
 659,
 795,
 1072,
 112,
 795,
 487,
 779,
 870,
 599,
 168,
 240,
 173,
 1163,
 1396,
 935,
 432,
 436,
 464,
 464,
 253,
 694,
 7473,
 178,
 1118,
 1502,
 216,
 589,
 135,
 467,
 352,
 1208,

In [127]:
max(ride_durations)

86398

In [128]:
min(ride_durations)

0

In [129]:
k_final['ride_durations'] = ride_durations

In [130]:
k_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago,451
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago,223
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago,171
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago,529
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago,332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,947
11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,915
11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,1750
11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,1331


In [131]:
k_final[(k_final['start_date'] != k_final['end_date'])]

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
1031,8E7BA80EB79CB5C6,docked_bike,15:43:14,07:53:13,Lincoln Ave & Fullerton Ave,127.0,Clark St & Drummond Pl,220.0,41.925900,-87.649300,...,casual,2020-01-14,2020-01-24,15,131.0,Chicago,19,123.0,Chicago,58198
1539,233F1AEC4D8962BC,docked_bike,23:54:35,00:00:31,Clarendon Ave & Leland Ave,251.0,Sheridan Rd & Lawrence Ave,323.0,41.968000,-87.650000,...,member,2020-01-26,2020-01-27,15,385.0,Chicago,15,384.0,Chicago,355
1926,9F192C7AF22C3A0D,docked_bike,23:59:54,00:04:04,Kingsbury St & Erie St,74.0,Dearborn St & Erie St,110.0,41.893800,-87.641700,...,member,2020-01-06,2020-01-07,23,265.0,Chicago,23,45.0,Chicago,249
2383,785A67FB11A02848,docked_bike,08:19:49,10:25:54,Clinton St & Jackson Blvd,638.0,Clark St & Lake St,38.0,41.878100,-87.639800,...,member,2020-01-17,2020-01-18,0,0.0,Chicago,27,547.0,Chicago,7565
2826,869B26EEDEFDF774,docked_bike,18:53:30,09:42:51,McClurg Ct & Illinois St,26.0,Franklin St & Jackson Blvd,36.0,41.890400,-87.617500,...,casual,2020-01-19,2020-01-23,23,51.0,Chicago,27,19.0,Chicago,53360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11952187,A60EC1FF63DA47FA,docked_bike,22:12:15,01:10:43,Clark St & Armitage Ave,13146,Halsted St & Roscoe St,TA1309000025,41.918306,-87.636282,...,casual,2022-10-07,2022-10-08,0,0.0,Chicago,0,0.0,Chicago,10707
11952260,BB99CBDD5BFD7E3E,classic_bike,15:58:52,13:30:34,Dodge Ave & Church St,600,Dodge Ave & Church St,600,42.048308,-87.698224,...,casual,2022-10-14,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,77501
11952424,E95B1203FF0E45E2,classic_bike,23:42:49,00:15:55,Clarendon Ave & Junior Ter,13389,Sheridan Rd & Montrose Ave,TA1307000107,41.961004,-87.649603,...,member,2022-10-07,2022-10-08,0,0.0,Chicago,0,0.0,Chicago,1985
11952751,48AF8AD0E587279F,classic_bike,23:49:44,00:00:38,Clark St & Newport St,632,Clark St & Wellington Ave,TA1307000136,41.944540,-87.654678,...,casual,2022-10-29,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,653


In [132]:
k_final[k_final['ride_durations'] == 86398]

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
155321,9461DFF13D8BA8AD,docked_bike,10:09:43,10:09:42,HQ QR,675.0,HQ QR,675.0,41.889900,-87.680300,...,casual,2020-02-28,2020-02-28,0,0.0,Chicago,0,0.0,Chicago,86398
435052,6EB323BCC83A9D1D,docked_bike,15:46:12,15:46:11,Francisco Ave & Foster Ave,471.0,Damen Ave & Clybourn Ave,163.0,41.975600,-87.701400,...,member,2020-04-05,2020-04-05,15,613.0,Chicago,15,271.0,Chicago,86398
500685,BA9FA2547D002402,docked_bike,21:08:00,21:07:59,State St & Harrison St,5.0,Fairbanks Ct & Grand Ave,24.0,41.874100,-87.627700,...,casual,2020-04-12,2020-04-12,19,30.0,Chicago,15,262.0,Chicago,86398
542525,0419646053F0CD9F,docked_bike,16:10:12,16:10:11,Wolcott Ave & Polk St,342.0,Halsted St & 21st St,135.0,41.871300,-87.673700,...,member,2020-05-19,2020-05-19,15,284.0,Chicago,11,162.0,Chicago,86398
543402,A33B261758CD58C8,docked_bike,14:54:59,14:54:58,Western Ave & Walton St,374.0,Western Ave & Walton St,374.0,41.898400,-87.686600,...,casual,2020-05-25,2020-05-25,15,662.0,Chicago,15,662.0,Chicago,86398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6664132,7C194D321B885287,classic_bike,14:02:52,14:02:51,Throop St & Taylor St,13139,Throop St & Taylor St,13139,41.868968,-87.659141,...,member,2021-09-29,2021-09-29,0,0.0,Chicago,0,0.0,Chicago,86398
6735196,082A465904C062EF,classic_bike,11:15:17,11:15:16,Clark St & Elm St,TA1307000039,Clark St & Elm St,TA1307000039,41.902973,-87.631280,...,casual,2021-09-26,2021-09-26,0,0.0,Chicago,0,0.0,Chicago,86398
6953020,88BA6AC7326B179B,classic_bike,15:26:23,15:26:22,Kimbark Ave & 53rd St,TA1309000037,Kimbark Ave & 53rd St,TA1309000037,41.799568,-87.594747,...,member,2021-09-29,2021-09-29,0,0.0,Chicago,0,0.0,Chicago,86398
10857091,179F7874064251C8,classic_bike,16:35:21,16:35:20,Clark St & Montrose Ave,KA1503000022,Clark St & Montrose Ave,KA1503000022,41.961588,-87.666036,...,casual,2022-08-05,2022-08-05,0,0.0,Chicago,0,0.0,Chicago,86398


In [133]:
# ok, so we need to factor in days as well. If more than a day passes, aka in between start_date and end_date, I will delete
# the row. additionally, if they entries are on the same day, the entries will be deleted if the start time is after the
# end time.

# test_start_date = ['2022-09-05','2022-09-07','2022-09-16','2022-09-26','2022-09-26','2021-09-26','2022-010-26','2022-09-26']
# test_end_date = ['2022-09-05','2022-09-06','2022-09-22', '2022-09-27','2021-09-26','2022-09-26','2022-09-26','2022-10-26']

def leap_year_check(year):
    if year % 4 == 0:
        
        if year % 100 != 0:
            if year % 400 == 0:
                return True
        else:
            return False
        
        return True
    
    else:
        return False



# this function takes start and end date combinations and checks if they are valid. It returns the index position of all invalid
# date combinations so that they can be dropped.
def date_checker(test_start_dates=[],test_end_dates=[]):
    month_max_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    
    date_invalid_check = []
    
    
    for i in range(len(test_start_dates)):
    
        start_d_test = test_start_dates[i].split('-')
        end_d_test = test_end_dates[i].split('-')
        
        
        # These are our years
        st_year = int(start_d_test[0])
        ed_year = int(end_d_test[0])
        
        # These are our months
        st_month = int(start_d_test[1])
        ed_month = int(end_d_test[1])

        # These are our days
        st_day = int(start_d_test[2])
        ed_day = int(end_d_test[2])
        
        
        # if the years of the dates are not the same
        if st_year != ed_year:
            #print('Year: ', i)
            date_invalid_check.append(i)

        # If st_month is after ed_month
        elif st_month > ed_month:
            #print(st_month == ed_month)
            #print('Start Month occurs after end month: ', i)
            date_invalid_check.append(i)
        
        # if st_month is more than 1 month before ed_month
        elif ed_month - st_month > 1:    
            #print(st_month == ed_month)
            #print('Start Month is too far from end month: ', i)
            date_invalid_check.append(i)
        
        elif ed_month == st_month + 1 and ed_day != 1:
            date_invalid_check.append(i)
        
        
        # If the ed_day is more than one day after the st_day, this works regardless of month
        elif ed_day - st_day > 1:
            #print('Day: ', i)
            date_invalid_check.append(i)
        
        # if ed_day - st_day is negative, we have to examine why
        elif ed_day - st_day < 0:
            #print('Day difference Negative: ', i)
            
            # if the month is the same, this chronological difference make no sense
            if ed_month == st_month:
                #print('Month is same: ', i)
                date_invalid_check.append(i)
            
            # if the st_month is the month before ed_month and it is the first day of ed_month
            elif ed_month == st_month + 1 and ed_day == 1:
                #print('Start Month end to Next month beginning: ', i)
                month_max = month_max_days[st_month - 1]

                # accounting for february leap years
                if leap_year_check(st_year) and st_month == 2:
                    #print('February: ', i)
                    month_max = 29
                
                # if our st_day is not the last day of the respective st_month, the date is invalid
                if st_day - month_max != 0:
                    #print('Start Month not ended yet, or day does not exist: ', i)
                    date_invalid_check.append(i)

    

    return date_invalid_check

#start_date = ['2015-07-30', '2014-06-18', '2020-02-29', '2014-03-31', '2015-07-10', '2015-12-31']
#end_date = ['2015-07-31', '2014-08-18', '2020-03-01', '2014-04-01', '2014-07-10', '2015-12-01']

#date_checker(start_date, end_date)


# This function will be used in the final stage. This was just testing and implementation for a subset of the data.

In [134]:
invalid_dates = date_checker(k_final['start_date'].values, k_final['end_date'].values)

In [135]:
len(invalid_dates)

2387

In [136]:
k_final.loc[invalid_dates]

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
1031,8E7BA80EB79CB5C6,docked_bike,15:43:14,07:53:13,Lincoln Ave & Fullerton Ave,127.0,Clark St & Drummond Pl,220.0,41.925900,-87.649300,...,casual,2020-01-14,2020-01-24,15,131.0,Chicago,19,123.0,Chicago,58198
2826,869B26EEDEFDF774,docked_bike,18:53:30,09:42:51,McClurg Ct & Illinois St,26.0,Franklin St & Jackson Blvd,36.0,41.890400,-87.617500,...,casual,2020-01-19,2020-01-23,23,51.0,Chicago,27,19.0,Chicago,53360
7533,651C959936BA3227,docked_bike,17:15:59,17:12:44,Rush St & Superior St,161.0,LaSalle St & Illinois St,181.0,41.895800,-87.625900,...,casual,2020-01-15,2020-01-24,23,530.0,Chicago,19,430.0,Chicago,86204
8033,4FC4437B29C9BF1B,docked_bike,17:05:36,13:14:10,Lakeview Ave & Fullerton Pkwy,313.0,Lakeview Ave & Fullerton Pkwy,313.0,41.925900,-87.639000,...,casual,2020-01-05,2020-01-07,19,147.0,Chicago,19,147.0,Chicago,72513
9052,78A27956CF4E52FC,docked_bike,16:11:57,06:02:05,Michigan Ave & Madison St,197.0,Clark St & Ida B Wells Dr,50.0,41.882100,-87.625100,...,casual,2020-01-17,2020-01-20,19,36.0,Chicago,27,9.0,Chicago,49807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10327777,17405F31D17313B2,docked_bike,13:07:51,11:10:12,Dusable Harbor,KA1503000064,Green St & Randolph St*,chargingstx3,41.886976,-87.612813,...,casual,2022-07-22,2022-07-29,0,0.0,Chicago,0,0.0,Chicago,79340
10328182,DC510E6F98003A94,docked_bike,18:37:11,00:32:38,DuSable Lake Shore Dr & Monroe St,13300,Green St & Randolph St*,chargingstx3,41.880958,-87.616743,...,casual,2022-07-04,2022-07-27,0,0.0,Chicago,0,0.0,Chicago,21326
10862902,4464B4342DAC2E9D,docked_bike,15:35:28,12:55:24,Montrose Harbor,TA1308000012,Green St & Randolph St*,chargingstx3,41.963982,-87.638181,...,casual,2022-08-26,2022-08-28,0,0.0,Chicago,0,0.0,Chicago,76795
10993136,C999C6BBBEC63568,docked_bike,11:04:19,19:52:40,Dusable Harbor,KA1503000064,Wilton Ave & Diversey Pkwy*,chargingstx0,41.886976,-87.612813,...,casual,2022-08-06,2022-08-09,0,0.0,Chicago,0,0.0,Chicago,31701


In [137]:
k_final = k_final.drop(invalid_dates).reset_index()

In [138]:
k_final

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
0,0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,...,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago,451
1,1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,...,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago,223
2,2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,...,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago,171
3,3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,...,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago,529
4,4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,...,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago,332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11950538,11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,947
11950539,11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,915
11950540,11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,...,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,1750
11950541,11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,1331


In [139]:
max(k_final['ride_durations'].values)

86398

In [140]:
k_final[k_final['ride_durations'] == 86398]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
155250,155321,9461DFF13D8BA8AD,docked_bike,10:09:43,10:09:42,HQ QR,675.0,HQ QR,675.0,41.889900,...,casual,2020-02-28,2020-02-28,0,0.0,Chicago,0,0.0,Chicago,86398
434826,435052,6EB323BCC83A9D1D,docked_bike,15:46:12,15:46:11,Francisco Ave & Foster Ave,471.0,Damen Ave & Clybourn Ave,163.0,41.975600,...,member,2020-04-05,2020-04-05,15,613.0,Chicago,15,271.0,Chicago,86398
500406,500685,BA9FA2547D002402,docked_bike,21:08:00,21:07:59,State St & Harrison St,5.0,Fairbanks Ct & Grand Ave,24.0,41.874100,...,casual,2020-04-12,2020-04-12,19,30.0,Chicago,15,262.0,Chicago,86398
542228,542525,0419646053F0CD9F,docked_bike,16:10:12,16:10:11,Wolcott Ave & Polk St,342.0,Halsted St & 21st St,135.0,41.871300,...,member,2020-05-19,2020-05-19,15,284.0,Chicago,11,162.0,Chicago,86398
543105,543402,A33B261758CD58C8,docked_bike,14:54:59,14:54:58,Western Ave & Walton St,374.0,Western Ave & Walton St,374.0,41.898400,...,casual,2020-05-25,2020-05-25,15,662.0,Chicago,15,662.0,Chicago,86398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6661997,6664132,7C194D321B885287,classic_bike,14:02:52,14:02:51,Throop St & Taylor St,13139,Throop St & Taylor St,13139,41.868968,...,member,2021-09-29,2021-09-29,0,0.0,Chicago,0,0.0,Chicago,86398
6733052,6735196,082A465904C062EF,classic_bike,11:15:17,11:15:16,Clark St & Elm St,TA1307000039,Clark St & Elm St,TA1307000039,41.902973,...,casual,2021-09-26,2021-09-26,0,0.0,Chicago,0,0.0,Chicago,86398
6950854,6953020,88BA6AC7326B179B,classic_bike,15:26:23,15:26:22,Kimbark Ave & 53rd St,TA1309000037,Kimbark Ave & 53rd St,TA1309000037,41.799568,...,member,2021-09-29,2021-09-29,0,0.0,Chicago,0,0.0,Chicago,86398
10854707,10857091,179F7874064251C8,classic_bike,16:35:21,16:35:20,Clark St & Montrose Ave,KA1503000022,Clark St & Montrose Ave,KA1503000022,41.961588,...,casual,2022-08-05,2022-08-05,0,0.0,Chicago,0,0.0,Chicago,86398


In [141]:
k_final

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
0,0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,...,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago,451
1,1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,...,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago,223
2,2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,...,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago,171
3,3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,...,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago,529
4,4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,...,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago,332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11950538,11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,947
11950539,11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,915
11950540,11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,...,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,1750
11950541,11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,1331


In [142]:
max(k_final['ride_durations'])

86398

In [143]:
k_final[k_final['ride_durations'] > 80000]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
27860,27873,BFDAE7A25425F884,docked_bike,11:26:27,09:57:38,Clarendon Ave & Junior Ter,245.0,Sheridan Rd & Buena Ave,306.0,41.961000,...,casual,2020-01-27,2020-01-28,15,389.0,Chicago,15,374.0,Chicago,81070
34157,34172,5B1C8D097249361F,docked_bike,10:21:21,09:41:23,Austin Blvd & Madison St,544.0,Austin Blvd & Chicago Ave,524.0,41.880300,...,member,2020-01-31,2020-02-01,15,0.0,Chicago,11,0.0,Chicago,84001
35566,35581,0C6A6F38A75FC304,docked_bike,08:15:10,07:16:03,Milwaukee Ave & Grand Ave,84.0,Racine Ave & Randolph St,88.0,41.891600,...,member,2020-01-09,2020-01-10,19,33.0,Chicago,15,155.0,Chicago,82852
80224,80265,14BF3B05D43BC271,docked_bike,10:25:00,08:41:37,Buckingham Fountain,2.0,Buckingham Fountain,2.0,41.876500,...,casual,2020-01-19,2020-01-20,35,541.0,Chicago,35,541.0,Chicago,80196
80225,80266,2A1403BAA266FF75,docked_bike,10:24:12,08:47:24,Buckingham Fountain,2.0,Buckingham Fountain,2.0,41.876500,...,casual,2020-01-19,2020-01-20,35,541.0,Chicago,35,541.0,Chicago,80591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11756866,11759253,03A545D927524313,classic_bike,12:59:29,12:16:14,Michigan Ave & Lake St,TA1305000011,Shore Dr & 55th St,TA1308000009,41.886024,...,casual,2022-10-08,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,83804
11769382,11771769,B456C184D4DD27EC,classic_bike,16:59:57,15:21:30,Clark St & North Ave,13128,Racine Ave & Belmont Ave,TA1308000019,41.911974,...,casual,2022-10-14,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,80492
11810305,11812692,833CF24D8B29AD90,classic_bike,13:00:04,11:51:52,Lincoln Ave & Fullerton Ave,TA1309000058,Southport Ave & Roscoe St,13071,41.924161,...,member,2022-10-14,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,82307
11845870,11848257,0345839F42C05388,classic_bike,16:22:08,15:08:02,State St & 35th St,TA1307000129,900 W Harrison St,13028,41.831036,...,member,2022-10-08,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,81953


In [144]:
# I can see that Warehouse, Test, and some other entries that are invalid may be in the dataframe, so they must be deleted as

# so we can delete all rows with occurances of the phrase
k_final = k_final.drop(k_final[k_final['from_station_name'].str.lower().str.contains("base", regex=False, na=False) == True].index)
k_final = k_final.drop(k_final[k_final['to_station_name'].str.lower().str.contains("base", regex=False, na=False) == True].index)

In [145]:
k_final[k_final['ride_durations'] > 80000]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
27860,27873,BFDAE7A25425F884,docked_bike,11:26:27,09:57:38,Clarendon Ave & Junior Ter,245.0,Sheridan Rd & Buena Ave,306.0,41.961000,...,casual,2020-01-27,2020-01-28,15,389.0,Chicago,15,374.0,Chicago,81070
34157,34172,5B1C8D097249361F,docked_bike,10:21:21,09:41:23,Austin Blvd & Madison St,544.0,Austin Blvd & Chicago Ave,524.0,41.880300,...,member,2020-01-31,2020-02-01,15,0.0,Chicago,11,0.0,Chicago,84001
35566,35581,0C6A6F38A75FC304,docked_bike,08:15:10,07:16:03,Milwaukee Ave & Grand Ave,84.0,Racine Ave & Randolph St,88.0,41.891600,...,member,2020-01-09,2020-01-10,19,33.0,Chicago,15,155.0,Chicago,82852
80224,80265,14BF3B05D43BC271,docked_bike,10:25:00,08:41:37,Buckingham Fountain,2.0,Buckingham Fountain,2.0,41.876500,...,casual,2020-01-19,2020-01-20,35,541.0,Chicago,35,541.0,Chicago,80196
80225,80266,2A1403BAA266FF75,docked_bike,10:24:12,08:47:24,Buckingham Fountain,2.0,Buckingham Fountain,2.0,41.876500,...,casual,2020-01-19,2020-01-20,35,541.0,Chicago,35,541.0,Chicago,80591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11756866,11759253,03A545D927524313,classic_bike,12:59:29,12:16:14,Michigan Ave & Lake St,TA1305000011,Shore Dr & 55th St,TA1308000009,41.886024,...,casual,2022-10-08,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,83804
11769382,11771769,B456C184D4DD27EC,classic_bike,16:59:57,15:21:30,Clark St & North Ave,13128,Racine Ave & Belmont Ave,TA1308000019,41.911974,...,casual,2022-10-14,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,80492
11810305,11812692,833CF24D8B29AD90,classic_bike,13:00:04,11:51:52,Lincoln Ave & Fullerton Ave,TA1309000058,Southport Ave & Roscoe St,13071,41.924161,...,member,2022-10-14,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,82307
11845870,11848257,0345839F42C05388,classic_bike,16:22:08,15:08:02,State St & 35th St,TA1307000129,900 W Harrison St,13028,41.831036,...,member,2022-10-08,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,81953


In [146]:
k_final

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
0,0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,...,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago,451
1,1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,...,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago,223
2,2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,...,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago,171
3,3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,...,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago,529
4,4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,...,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago,332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11950538,11952925,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,947
11950539,11952926,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,915
11950540,11952927,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,...,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,1750
11950541,11952928,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,1331


In [147]:
the_set = list(set(list(k_final['from_station_id'].values)))

for i in range(len(the_set)):
    print(the_set[i])

13150
2.0
3.0
4.0
5.0
6.0
7.0
309.0
9.0
524.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
28.0
29.0
30.0
31.0
32.0
33.0
34.0
35.0
36.0
37.0
38.0
39.0
40.0
41.0
42.0
43.0
44.0
45.0
46.0
47.0
48.0
49.0
50.0
51.0
52.0
53.0
54.0
55.0
56.0
57.0
58.0
59.0
60.0
61.0
62.0
294.0
KA1503000070
KA1504000101
66.0
67.0
68.0
69.0
KA1706005015
71.0
72.0
73.0
74.0
75.0
76.0
77.0
71.0
15621
80.0
81.0
20214
84.0
85.0
86.0
87.0
88.0
89.0
90.0
91.0
92.0
93.0
94.0
95.0
96.0
97.0
98.0
99.0
100.0
101.0
102.0
103.0
KA1503000019
106.0
107.0
108.0
109.0
110.0
111.0
112.0
113.0
114.0
115.0
116.0
117.0
118.0
119.0
120.0
121.0
122.0
123.0
124.0
125.0
126.0
127.0
128.0
129.0
130.0
131.0
132.0
133.0
134.0
135.0
136.0
137.0
138.0
16940
140.0
141.0
142.0
143.0
144.0
145.0
146.0
147.0
148.0
149.0
150.0
20252.0
152.0
153.0
154.0
156.0
157.0
158.0
159.0
160.0
161.0
162.0
163.0
164.0
165.0
166.0
167.0
168.0
169.0
170.0
171.0
172.0
173.0
174.0
175.0
176.0
177.0
178.0
179.0
180.0
181.

In [148]:
# used this to investigate stand-out id entries and the like
k_final[k_final['from_station_id'] == 'DIVVY 001 - Warehouse test station']

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
8703569,8705946,599C6337A1F1635A,electric_bike,19:21:15,19:29:33,WEST CHI-WATSON,DIVVY 001 - Warehouse test station,Wood St & Taylor St (Temp),13285,41.851673,...,member,2022-05-07,2022-05-07,0,0.0,Chicago,0,0.0,Chicago,498
9197402,9199780,7A492BA13DC9849E,electric_bike,21:12:47,21:54:42,WestChi,DIVVY 001 - Warehouse test station,Mulligan Ave & Wellington Ave,351,41.934747,...,casual,2022-06-03,2022-06-03,0,0.0,Chicago,0,0.0,Chicago,2515
9657783,9660162,7325C8783518FE2F,electric_bike,19:58:33,20:00:46,WestChi,DIVVY 001 - Warehouse test station,Narragansett & Wrightwood,314,41.927299,...,member,2022-06-03,2022-06-03,0,0.0,Chicago,0,0.0,Chicago,133
9747690,9750069,226015676A82060F,electric_bike,15:08:17,15:25:14,WEST CHI-WATSON,DIVVY 001 - Warehouse test station,Milwaukee Ave & Fullerton Ave,428,41.924738,...,casual,2022-06-22,2022-06-22,0,0.0,Chicago,0,0.0,Chicago,1017
10203811,10206193,053289A38DEB1606,electric_bike,08:10:53,08:16:17,WestChi,DIVVY 001 - Warehouse test station,Pulaski Rd & 21st St,331.0,41.836699,...,member,2022-07-20,2022-07-20,0,0.0,Chicago,19,176.0,Chicago,324
10883127,10885512,0E2BDA1E247E7F0C,electric_bike,21:06:10,21:15:15,WEST CHI-WATSON,DIVVY 001 - Warehouse test station,Long & Irving Park,398,41.927669,...,casual,2022-08-13,2022-08-13,0,0.0,Chicago,0,0.0,Chicago,545
10968379,10970764,A80F66E86B532671,electric_bike,10:45:23,11:18:03,WestChi,DIVVY 001 - Warehouse test station,Normal Ave & Archer Ave,TA1308000014,41.932418,...,casual,2022-08-12,2022-08-12,0,0.0,Chicago,0,0.0,Chicago,1960


In [149]:
# id: '2059 Hastings Warehouse Station' delete rides with warehouse in the name, station name: 'NewHastings'

k_final = k_final.drop(k_final[k_final['from_station_id'] == '2059 Hastings Warehouse Station'].index)
k_final = k_final.drop(k_final[k_final['end_station_id'] == '2059 Hastings Warehouse Station'].index)

In [150]:
k_final.shape

(11948584, 23)

In [151]:
# delete these 'Hastings WH 2' , 'Throop/Hastings Mobile Station'

k_final = k_final.drop(k_final[k_final['from_station_id'] == 'Hastings WH 2'].index)
k_final = k_final.drop(k_final[k_final['end_station_id'] == 'Hastings WH 2'].index)

k_final = k_final.drop(k_final[k_final['from_station_id'] == 'Throop/Hastings Mobile Station'].index)
k_final = k_final.drop(k_final[k_final['end_station_id'] == 'Throop/Hastings Mobile Station'].index)





In [152]:
k_final[k_final['from_station_id'] == 'DIVVY 001 - Warehouse test station']

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
8703569,8705946,599C6337A1F1635A,electric_bike,19:21:15,19:29:33,WEST CHI-WATSON,DIVVY 001 - Warehouse test station,Wood St & Taylor St (Temp),13285,41.851673,...,member,2022-05-07,2022-05-07,0,0.0,Chicago,0,0.0,Chicago,498
9197402,9199780,7A492BA13DC9849E,electric_bike,21:12:47,21:54:42,WestChi,DIVVY 001 - Warehouse test station,Mulligan Ave & Wellington Ave,351,41.934747,...,casual,2022-06-03,2022-06-03,0,0.0,Chicago,0,0.0,Chicago,2515
9657783,9660162,7325C8783518FE2F,electric_bike,19:58:33,20:00:46,WestChi,DIVVY 001 - Warehouse test station,Narragansett & Wrightwood,314,41.927299,...,member,2022-06-03,2022-06-03,0,0.0,Chicago,0,0.0,Chicago,133
9747690,9750069,226015676A82060F,electric_bike,15:08:17,15:25:14,WEST CHI-WATSON,DIVVY 001 - Warehouse test station,Milwaukee Ave & Fullerton Ave,428,41.924738,...,casual,2022-06-22,2022-06-22,0,0.0,Chicago,0,0.0,Chicago,1017
10203811,10206193,053289A38DEB1606,electric_bike,08:10:53,08:16:17,WestChi,DIVVY 001 - Warehouse test station,Pulaski Rd & 21st St,331.0,41.836699,...,member,2022-07-20,2022-07-20,0,0.0,Chicago,19,176.0,Chicago,324
10883127,10885512,0E2BDA1E247E7F0C,electric_bike,21:06:10,21:15:15,WEST CHI-WATSON,DIVVY 001 - Warehouse test station,Long & Irving Park,398,41.927669,...,casual,2022-08-13,2022-08-13,0,0.0,Chicago,0,0.0,Chicago,545
10968379,10970764,A80F66E86B532671,electric_bike,10:45:23,11:18:03,WestChi,DIVVY 001 - Warehouse test station,Normal Ave & Archer Ave,TA1308000014,41.932418,...,casual,2022-08-12,2022-08-12,0,0.0,Chicago,0,0.0,Chicago,1960


In [153]:
# delete all 'DIVVY 001 - Warehouse test station' entries, either way

k_final = k_final.drop(k_final[k_final['from_station_id'] == 'DIVVY 001 - Warehouse test station'].index)
k_final = k_final.drop(k_final[k_final['end_station_id'] == 'DIVVY 001 - Warehouse test station'].index)

In [154]:
# 'Clark St & 9th St (AMLI)' id is messed up, it is 394 on some and SL-009 on others, needs replacement

k_final.loc[k_final['from_station_name'] == 'Clark St & 9th St (AMLI)', 'from_station_id'] = 'SL-009'
k_final.loc[k_final['to_station_name'] == 'Clark St & 9th St (AMLI)', 'end_station_id'] = 'SL-009'

# may do this check for each value to make sure everything is mapped to one id only
set(list(k_final.from_station_id[k_final['from_station_name'] == 'Clark St & 9th St (AMLI)'].values))

{'SL-009'}

In [155]:
k_final[k_final['from_station_name'] == 'Clark St & 9th St (AMLI)']

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
4566,4568,0900021397DF495A,docked_bike,06:16:43,06:22:32,Clark St & 9th St (AMLI),SL-009,Franklin St & Monroe St,287.0,41.870800,...,member,2020-01-07,2020-01-07,15,0.0,Chicago,23,57.0,Chicago,349
4570,4572,155033D91BC55315,docked_bike,17:50:28,18:07:05,Clark St & 9th St (AMLI),SL-009,Damen Ave & Madison St,215.0,41.870800,...,member,2020-01-15,2020-01-15,15,0.0,Chicago,15,134.0,Chicago,997
4571,4573,1DD9D4BB4CDC986C,docked_bike,06:20:58,06:26:41,Clark St & 9th St (AMLI),SL-009,Franklin St & Monroe St,287.0,41.870800,...,member,2020-01-27,2020-01-27,15,0.0,Chicago,23,57.0,Chicago,343
5391,5393,C39D5171DF851BE5,docked_bike,12:19:47,12:29:44,Clark St & 9th St (AMLI),SL-009,State St & Van Buren St,33.0,41.870800,...,casual,2020-01-31,2020-01-31,15,0.0,Chicago,27,3.0,Chicago,597
6566,6568,5D314128CC06B755,docked_bike,09:34:07,09:52:58,Clark St & 9th St (AMLI),SL-009,Paulina St & Flournoy St,383.0,41.870800,...,member,2020-01-06,2020-01-06,15,0.0,Chicago,23,669.0,Chicago,1131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11947393,11949780,273D9699DD69E496,classic_bike,12:58:25,13:26:34,Clark St & 9th St (AMLI),SL-009,Rush St & Hubbard St,KA1503000044,41.870816,...,casual,2022-10-08,2022-10-08,0,0.0,Chicago,0,0.0,Chicago,1689
11947411,11949798,35C45A0F35075F1E,classic_bike,20:36:12,20:56:12,Clark St & 9th St (AMLI),SL-009,Aberdeen St & Jackson Blvd,13157,41.870816,...,member,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,1200
11947412,11949799,1BA94D176F4369C7,classic_bike,19:13:44,19:56:04,Clark St & 9th St (AMLI),SL-009,Aberdeen St & Jackson Blvd,13157,41.870816,...,member,2022-10-28,2022-10-28,0,0.0,Chicago,0,0.0,Chicago,2540
11947424,11949811,AD06BD156FB5884F,classic_bike,12:58:31,13:26:40,Clark St & 9th St (AMLI),SL-009,Rush St & Hubbard St,KA1503000044,41.870816,...,casual,2022-10-08,2022-10-08,0,0.0,Chicago,0,0.0,Chicago,1689


In [156]:
set(list(k_final.end_station_id[k_final['to_station_name'] == 'Clark St & 9th St (AMLI)'].values))

{'SL-009'}

In [157]:
# check validity of these
# 'Bissell St & Armitage Ave - Charging', 'chargingstx0'
# basically want to delete all rides that start and end with 'charging' or 'charge' in the station name/id, because these are
# not actual rides, they are charging sessions that are clocked with a ridetime

term1 = (k_final['from_station_name'].str.lower().str.contains("charg", regex=False, na=False) == True)
term2 = (k_final['to_station_name'].str.lower().str.contains("charg", regex=False, na=False) == True)
term3 = (k_final['ride_durations'] < 65)

k_final = k_final.drop(k_final[term1 & term2 & term3].index)

In [158]:
k_final[k_final['from_station_name'].str.lower().str.contains("charg", regex=False, na=False) == True]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
8135551,8137905,76C6AD16440C64D4,electric_bike,15:56:55,16:01:33,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,41.932303,...,casual,2022-02-09,2022-02-09,0,0.0,Chicago,0,0.0,Chicago,278
8143185,8145539,B17C6D61C8E1389D,electric_bike,11:56:32,12:01:23,Bissell St & Armitage Ave - Charging,Bissell St & Armitage Ave - Charging,Wells St & Concord Ln,TA1308000050,41.918497,...,casual,2022-03-27,2022-03-27,0,0.0,Chicago,0,0.0,Chicago,291
8144652,8147006,540029D3110BB9A3,classic_bike,06:45:40,06:54:24,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Larrabee St & Webster Ave,13193,41.932418,...,member,2022-03-23,2022-03-23,0,0.0,Chicago,0,0.0,Chicago,524
8144744,8147098,54EB9671F6D538C6,classic_bike,13:47:17,13:56:02,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Belmont Ave,TA1307000134,41.932418,...,casual,2022-03-25,2022-03-25,0,0.0,Chicago,0,0.0,Chicago,525
8150747,8153102,535BD63805CC5965,electric_bike,17:39:52,17:54:17,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Greenview Ave & Fullerton Ave,TA1307000001,41.932272,...,member,2022-03-21,2022-03-21,0,0.0,Chicago,0,0.0,Chicago,865
8156855,8159210,B72A59643F35031E,classic_bike,11:51:41,12:00:29,Bissell St & Armitage Ave - Charging,Bissell St & Armitage Ave - Charging,Lincoln Ave & Diversey Pkwy,TA1307000064,41.918018,...,member,2022-03-28,2022-03-28,0,0.0,Chicago,0,0.0,Chicago,528
8158466,8160821,A7DEC4A909D6B88C,classic_bike,18:55:50,19:03:53,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Sheffield Ave & Willow St,TA1306000032,41.932418,...,member,2022-03-27,2022-03-27,0,0.0,Chicago,0,0.0,Chicago,483
8163558,8165913,0B3C81597B6FE13A,electric_bike,11:18:21,11:25:31,Bissell St & Armitage Ave - Charging,Bissell St & Armitage Ave - Charging,Clark St & Lincoln Ave,13179,41.918529,...,casual,2022-03-27,2022-03-27,0,0.0,Chicago,0,0.0,Chicago,430
8170911,8173267,AE20848CB28E3381,classic_bike,22:45:36,22:48:54,Bissell St & Armitage Ave - Charging,Bissell St & Armitage Ave - Charging,Sheffield Ave & Fullerton Ave,TA1306000016,41.918018,...,member,2022-03-24,2022-03-24,0,0.0,Chicago,0,0.0,Chicago,198
8174025,8176382,5AEA7589177D53DD,classic_bike,19:12:22,19:17:42,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Sheffield Ave & Webster Ave,TA1309000033,41.932418,...,member,2022-03-22,2022-03-22,0,0.0,Chicago,0,0.0,Chicago,320


In [159]:
# I will apply the same rule to charging station ids
k_final[k_final['from_station_id'].str.lower().str.contains("charg", regex=False, na=False) == True]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
8135551,8137905,76C6AD16440C64D4,electric_bike,15:56:55,16:01:33,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,41.932303,...,casual,2022-02-09,2022-02-09,0,0.0,Chicago,0,0.0,Chicago,278
8143185,8145539,B17C6D61C8E1389D,electric_bike,11:56:32,12:01:23,Bissell St & Armitage Ave - Charging,Bissell St & Armitage Ave - Charging,Wells St & Concord Ln,TA1308000050,41.918497,...,casual,2022-03-27,2022-03-27,0,0.0,Chicago,0,0.0,Chicago,291
8144652,8147006,540029D3110BB9A3,classic_bike,06:45:40,06:54:24,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Larrabee St & Webster Ave,13193,41.932418,...,member,2022-03-23,2022-03-23,0,0.0,Chicago,0,0.0,Chicago,524
8144744,8147098,54EB9671F6D538C6,classic_bike,13:47:17,13:56:02,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Belmont Ave,TA1307000134,41.932418,...,casual,2022-03-25,2022-03-25,0,0.0,Chicago,0,0.0,Chicago,525
8150747,8153102,535BD63805CC5965,electric_bike,17:39:52,17:54:17,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Greenview Ave & Fullerton Ave,TA1307000001,41.932272,...,member,2022-03-21,2022-03-21,0,0.0,Chicago,0,0.0,Chicago,865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11945929,11948316,C52E308D6F3E8FC4,classic_bike,16:38:57,16:43:36,Morgan St & Lake St*,chargingstx4,Aberdeen St & Jackson Blvd,13157,41.885492,...,casual,2022-10-06,2022-10-06,0,0.0,Chicago,0,0.0,Chicago,279
11945932,11948319,6428BBD05DE0B412,electric_bike,23:13:32,23:20:03,Morgan St & Lake St*,chargingstx4,Rush St & Hubbard St,KA1503000044,41.885452,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,391
11946093,11948480,13B866475AB890E2,electric_bike,11:10:14,11:16:31,Morgan St & Lake St*,chargingstx4,Aberdeen St & Jackson Blvd,13157,41.885410,...,member,2022-10-12,2022-10-12,0,0.0,Chicago,0,0.0,Chicago,377
11946094,11948481,02418DAFCCC881C4,electric_bike,15:42:59,15:47:38,Morgan St & Lake St*,chargingstx4,Aberdeen St & Jackson Blvd,13157,41.885509,...,casual,2022-10-05,2022-10-05,0,0.0,Chicago,0,0.0,Chicago,279


In [160]:
term_1 = (k_final['from_station_id'].str.lower().str.contains("charg", regex=False, na=False) == True)
term_2 = (k_final['end_station_id'].str.lower().str.contains("charg", regex=False, na=False) == True)
term_3 = (k_final['ride_durations'] < 65)

k_final = k_final.drop(k_final[term_1 & term_2 & term_3].index)

In [161]:
k_final[k_final['from_station_id'].str.lower().str.contains("charg", regex=False, na=False) == True]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
8135551,8137905,76C6AD16440C64D4,electric_bike,15:56:55,16:01:33,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,41.932303,...,casual,2022-02-09,2022-02-09,0,0.0,Chicago,0,0.0,Chicago,278
8143185,8145539,B17C6D61C8E1389D,electric_bike,11:56:32,12:01:23,Bissell St & Armitage Ave - Charging,Bissell St & Armitage Ave - Charging,Wells St & Concord Ln,TA1308000050,41.918497,...,casual,2022-03-27,2022-03-27,0,0.0,Chicago,0,0.0,Chicago,291
8144652,8147006,540029D3110BB9A3,classic_bike,06:45:40,06:54:24,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Larrabee St & Webster Ave,13193,41.932418,...,member,2022-03-23,2022-03-23,0,0.0,Chicago,0,0.0,Chicago,524
8144744,8147098,54EB9671F6D538C6,classic_bike,13:47:17,13:56:02,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Belmont Ave,TA1307000134,41.932418,...,casual,2022-03-25,2022-03-25,0,0.0,Chicago,0,0.0,Chicago,525
8150747,8153102,535BD63805CC5965,electric_bike,17:39:52,17:54:17,Wilton Ave & Diversey Pkwy - Charging,Wilton Ave & Diversey Pkwy - Charging,Greenview Ave & Fullerton Ave,TA1307000001,41.932272,...,member,2022-03-21,2022-03-21,0,0.0,Chicago,0,0.0,Chicago,865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11945929,11948316,C52E308D6F3E8FC4,classic_bike,16:38:57,16:43:36,Morgan St & Lake St*,chargingstx4,Aberdeen St & Jackson Blvd,13157,41.885492,...,casual,2022-10-06,2022-10-06,0,0.0,Chicago,0,0.0,Chicago,279
11945932,11948319,6428BBD05DE0B412,electric_bike,23:13:32,23:20:03,Morgan St & Lake St*,chargingstx4,Rush St & Hubbard St,KA1503000044,41.885452,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,391
11946093,11948480,13B866475AB890E2,electric_bike,11:10:14,11:16:31,Morgan St & Lake St*,chargingstx4,Aberdeen St & Jackson Blvd,13157,41.885410,...,member,2022-10-12,2022-10-12,0,0.0,Chicago,0,0.0,Chicago,377
11946094,11948481,02418DAFCCC881C4,electric_bike,15:42:59,15:47:38,Morgan St & Lake St*,chargingstx4,Aberdeen St & Jackson Blvd,13157,41.885509,...,casual,2022-10-05,2022-10-05,0,0.0,Chicago,0,0.0,Chicago,279


In [162]:
# deleted 2046 rows. Now I want to examine how many entries have 0 second entries for ride time, so I can delete those as well. 

In [163]:
k_final[k_final['ride_durations'] == 0]

Unnamed: 0,index,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
154090,154161,23EF1DCC9FCA40BA,docked_bike,11:34:40,11:34:40,HQ QR,675.0,HQ QR,675.0,41.889900,...,casual,2020-02-28,2020-02-28,0,0.0,Chicago,0,0.0,Chicago,0
162810,162888,86163D9676BBBE62,docked_bike,14:41:16,14:41:16,HQ QR,675.0,HQ QR,675.0,41.889900,...,casual,2020-02-26,2020-02-26,0,0.0,Chicago,0,0.0,Chicago,0
171299,171382,836931C569802344,docked_bike,09:56:47,09:56:47,HQ QR,675.0,HQ QR,675.0,41.889900,...,casual,2020-02-27,2020-02-27,0,0.0,Chicago,0,0.0,Chicago,0
172304,172387,07CD3CBC94106B37,docked_bike,10:02:30,10:02:30,HQ QR,675.0,HQ QR,675.0,41.889900,...,casual,2020-02-28,2020-02-28,0,0.0,Chicago,0,0.0,Chicago,0
183884,183972,83D849E5C5716FA3,docked_bike,10:39:01,10:39:01,HQ QR,675.0,HQ QR,675.0,41.889900,...,casual,2020-02-28,2020-02-28,0,0.0,Chicago,0,0.0,Chicago,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11887485,11889872,53C0EA485F89ED0E,classic_bike,15:40:18,15:40:18,Sheffield Ave & Kingsbury St,13154,Sheffield Ave & Kingsbury St,13154,41.910522,...,member,2022-10-07,2022-10-07,0,0.0,Chicago,0,0.0,Chicago,0
11925855,11928242,411F609A2E4C54F2,electric_bike,15:19:21,15:19:21,Kimball Ave & Belmont Ave,KA150400009X,Public Rack - Christiana Ave & Belmont Ave,499,41.939404,...,member,2022-10-07,2022-10-07,0,0.0,Chicago,0,0.0,Chicago,0
11929536,11931923,2896635694A8BC24,electric_bike,17:01:13,17:01:13,Sheffield Ave & Wrightwood Ave,TA1309000023,Sheffield Ave & Wrightwood Ave,TA1309000023,41.928549,...,member,2022-10-23,2022-10-23,0,0.0,Chicago,0,0.0,Chicago,0
11949457,11951844,088C9ADC2E7E3852,electric_bike,15:13:23,15:13:23,Clark St & Armitage Ave,13146,Clark St & Armitage Ave,13146,41.918334,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,0


In [164]:
# Now that I have observed HQ QR, I will delete all station names: 'HQ QR' and 675.0 in addition to all ride_durations = 0

k_final = k_final.drop(k_final[k_final['ride_durations'] == 0].index)

k_final = k_final.drop(k_final[k_final['from_station_id'] == '675.0'].index)
k_final = k_final.drop(k_final[k_final['end_station_id'] == '675.0'].index)


k_final = k_final.drop(k_final[k_final['from_station_name'] == 'HQ QR'].index)
k_final = k_final.drop(k_final[k_final['to_station_name'] == 'HQ QR'].index)

In [167]:
k_final.isnull().sum()

index                0
ride_id              0
rideable_type        0
start_time           0
end_time             0
from_station_name    0
from_station_id      0
to_station_name      0
end_station_id       0
start_lat            0
start_lng            0
end_lat              0
end_lng              0
usertype             0
start_date           0
end_date             0
from_dpcapacity      0
from_landmark        0
from_city            0
to_dpcapacity        0
to_landmark          0
to_city              0
ride_durations       0
dtype: int64

In [171]:
k_final = k_final.drop('index',axis=1)

In [172]:
k_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,usertype,start_date,end_date,from_dpcapacity,from_landmark,from_city,to_dpcapacity,to_landmark,to_city,ride_durations
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago,451
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago,223
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago,171
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago,529
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago,332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11950538,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,947
11950539,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,915
11950540,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,1750
11950541,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,1331


In [173]:
k_final = k_final.rename(columns={'ride_id' : 'trip_id', 'from_dpcapacity' : 'end_dpcapacity', 'to_dpcapacity' : 'start_dpcapacity', 'ride_durations' : 'tripduration'})

In [174]:
k_final

Unnamed: 0,trip_id,rideable_type,start_time,end_time,from_station_name,from_station_id,to_station_name,end_station_id,start_lat,start_lng,...,usertype,start_date,end_date,end_dpcapacity,from_landmark,from_city,start_dpcapacity,to_landmark,to_city,tripduration
0,EACB19130B0CDA4A,docked_bike,20:06:59,20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.966500,-87.688400,...,member,2020-01-21,2020-01-21,15,112.0,Chicago,11,239.0,Chicago,451
1,8FED874C809DC021,docked_bike,14:22:39,14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.961600,-87.666000,...,member,2020-01-30,2020-01-30,19,316.0,Chicago,15,321.0,Chicago,223
2,789F3C21E472CA96,docked_bike,19:29:26,19:32:17,Broadway & Belmont Ave,296.0,Wilton Ave & Belmont Ave,117.0,41.940100,-87.645500,...,member,2020-01-09,2020-01-09,15,277.0,Chicago,23,69.0,Chicago,171
3,C9A388DAC6ABF313,docked_bike,16:17:07,16:25:56,Clark St & Randolph St,51.0,Fairbanks Ct & Grand Ave,24.0,41.884600,-87.631900,...,member,2020-01-06,2020-01-06,31,5.0,Chicago,15,262.0,Chicago,529
4,943BC3CBECCFD662,docked_bike,08:37:16,08:42:48,Clinton St & Lake St,66.0,Wells St & Hubbard St,212.0,41.885600,-87.641800,...,member,2020-01-30,2020-01-30,19,21.0,Chicago,19,264.0,Chicago,332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11950538,BC3BFA659C9AB6F1,classic_bike,01:41:29,01:57:16,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,947
11950539,ACD65450291CF95F,classic_bike,01:41:54,01:57:09,Clifton Ave & Armitage Ave,TA1307000163,Lincoln Ave & Roscoe St*,chargingstx5,41.918216,-87.656936,...,casual,2022-10-30,2022-10-30,0,0.0,Chicago,0,0.0,Chicago,915
11950540,4AAC03D1438E97CA,classic_bike,09:34:11,10:03:21,Sedgwick St & North Ave,TA1307000038,Wabash Ave & Grand Ave,TA1307000117,41.911386,-87.638677,...,casual,2022-10-15,2022-10-15,0,0.0,Chicago,0,0.0,Chicago,1750
11950541,8E6F3F29785E5D40,classic_bike,10:21:34,10:43:45,Sedgwick St & North Ave,TA1307000038,Damen Ave & Clybourn Ave,13271,41.911386,-87.638677,...,member,2022-10-09,2022-10-09,0,0.0,Chicago,0,0.0,Chicago,1331


In [175]:
k_final.to_csv('merged2020_2022.csv')