## Imports

In [1]:
import pandas as pd
import numpy as np
from data_cleaning import clean_data
import folium
from geopy.distance import geodesic

## Usefull functions

### Generate maps

In [2]:
def map_stations_all_cords(data):
    bluebikes_start_stations_map_points = folium.Map(location=(42.361145, -71.057083), tiles="OpenStreetMap", zoom_start=12)

    # draw positions for the stations
    # whether we choose the coordinates from start or end station does not matter since they are the same
    #i want a red icon
    for key, value in data.items():
            color = ['red', 'green', 'blue', 'orange', 'white']
            i = 0
            for point in value:
                folium.Marker(location=point, icon=folium.Icon(color=color[i]), popup=(str(key) + "\n" + str(point))).add_to(bluebikes_start_stations_map_points)
                i += 1
        
    return bluebikes_start_stations_map_points

In [3]:
def map_stations_with_multiple_coordinates(data):
    bluebikes_start_stations_map_points = folium.Map(location=(42.361145, -71.057083), tiles="OpenStreetMap", zoom_start=12)

    # draw positions for the stations
    # whether we choose the coordinates from start or end station does not matter since they are the same
    #i want a red icon
    for key, value in data.items():
        if(len(value) > 1):
            color = ['red', 'green', 'blue', 'orange', 'white']
            i = 0
            for point in value:
                folium.Marker(location=point, icon=folium.Icon(color=color[i]), popup=(str(key) + "\n" + str(point))).add_to(bluebikes_start_stations_map_points)
                i += 1
        
    return bluebikes_start_stations_map_points

In [4]:
#Idea based on following chatGPT-Prompt:
# i have an dic with a key with lon and lat data attached. I want to show only data on a folium map 
#where the points in the array have a certain distance. How do i do that?

def far_away_map(pointDict, distance):
    far_away_dict = {}
    bosten = (42.361145, -71.057083)
    for key, value in pointDict.items():
        if(len(value) > 1):
            reference = value[0]
            for point in value[1:]:
                if(geodesic(point, reference).miles >distance):
                    if not key in far_away_dict:
                        far_away_dict[key] = value
        if(len(value) >= 1):    
            if(geodesic(value[0], bosten).miles > 10):
                if not key in far_away_dict:
                    far_away_dict[key] = value

    far_away_map_all = folium.Map(location=(42.361145, -71.057083), tiles="OpenStreetMap", zoom_start=10)

    for key, value in far_away_dict.items():
        color = ['red', 'green', 'blue', 'orange', 'yellow']
        i = 0
        for point in value:
            folium.Marker(location=point, icon=folium.Icon(color=color[i]), popup=(str(key) + "\n" + str(point))).add_to(far_away_map_all)
            i += 1
    return far_away_map_all

### Data functions

In [5]:
#takes about 10 minutes to run
def get_station_dict(data):
    all_station = {}
    end_dict ={}
    start_dict = {}
    for row in data.iterrows():
        if row[1].start_station_id in start_dict:
            if not (row[1].start_station_lat, row[1].start_station_lon) in start_dict[row[1].start_station_id]:
                start_dict[row[1].start_station_id].append((row[1].start_station_lat, row[1].start_station_lon))
        else:
            start_dict[row[1].start_station_id] = [(row[1].start_station_lat, row[1].start_station_lon)]
        if row[1].end_station_id in end_dict:
            if not (row[1].end_station_lat, row[1].end_station_lon) in end_dict[row[1].end_station_id]:
                end_dict[row[1].end_station_id].append((row[1].end_station_lat, row[1].end_station_lon))
        else:
            end_dict[row[1].end_station_id] = [(row[1].end_station_lat, row[1].end_station_lon)]

        if row[1].start_station_id in all_station:
            if not (row[1].start_station_lat, row[1].start_station_lon) in all_station[row[1].start_station_id]:
                all_station[row[1].start_station_id].append((row[1].start_station_lat, row[1].start_station_lon))
        else:
            all_station[row[1].start_station_id] = [(row[1].start_station_lat, row[1].start_station_lon)]
        if row[1].end_station_id in all_station:
            if not (row[1].end_station_lat, row[1].end_station_lon) in all_station[row[1].end_station_id]:
                all_station[row[1].end_station_id].append((row[1].end_station_lat, row[1].end_station_lon))
        else:
            all_station[row[1].end_station_id] = [(row[1].end_station_lat, row[1].end_station_lon)]    

    return start_dict, end_dict, all_station

## Load data

In [6]:
bluebikes = clean_data(r"C:\Users\STH_0\Desktop\Data\bluebikes_2017_2018")
bluebikes.head(5)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,weekday
0,2017-01-01 00:06:58,2017-01-01 00:12:49,67,139,644,6
1,2017-01-01 00:13:16,2017-01-01 00:28:07,36,10,230,6
2,2017-01-01 00:16:17,2017-01-01 00:44:10,36,9,980,6
3,2017-01-01 00:21:22,2017-01-01 00:33:50,46,19,1834,6
4,2017-01-01 00:30:06,2017-01-01 00:40:28,10,8,230,6


## Get positions of every station id

In [7]:
# get all stations with their coordinates
start_dict, end_dict, all_station = get_station_dict(bluebikes)

AttributeError: 'Series' object has no attribute 'start_station_lat'

In [None]:
start_dict.keys().__len__()

318

In [None]:
end_dict.keys().__len__()

319

In [None]:
i = 0
for key, value in start_dict.items():
    if len(value) > 1:
        #print(str(key) + ": "    + str(value))
        i += 1
i

72

In [None]:
i = 0
for key, value in end_dict.items():
    if len(value) > 1:
        #print(str(key) + ": "    + str(value))
        i += 1
i

68

In [None]:
#compare start and end station
i = 0
for key, value in start_dict.items():
    if key in end_dict:
        if value != end_dict[key]:
            print(str(key) + ": "    + str(value) + " != " + str(end_dict[key]))
            i += 1
i            

77: [(42.386844, -71.09812), (42.396386809455, -71.12226963043213)] != [(42.386844, -71.09812)]
217: [(42.38673178529929, -71.00621223455164), (42.386781, -71.006098)] != [(42.386781, -71.006098)]
135: [(42.344827, -71.028664), (42.353334, -71.09850591982195)] != [(42.344827, -71.028664)]
201: [(42.316873, -71.091955), (42.316902, -71.091946)] != [(42.316902, -71.091946)]
219: [(42.37430131617823, -71.03808671246952), (42.374335, -71.039685), (42.37430732260765, -71.03807777166365), (42.373312125824704, -71.0410200806291)] != [(42.374335, -71.039685), (42.37430732260765, -71.03807777166365), (42.373312125824704, -71.0410200806291)]
205: [(42.30791912678712, -71.06502056121826), (42.306348, -71.067149), (42.30785224238503, -71.06512248516083)] != [(42.306348, -71.067149), (42.30785224238503, -71.06512248516083)]
153: [(45.505086, -73.566921), (42.37530443828819, -71.08688293667001)] != [(45.505086, -73.566921)]
212: [(42.36884408289835, -71.03977829217911)] != [(42.3688385713652, -71.03

10

In [None]:
#compare start and end station
i = 0
for key, value in end_dict.items():
    if key in end_dict:
        if start_dict.get(key) != None:
            if value != start_dict[key]:
                print(str(key) + ": "    + str(value) + " != " + str(start_dict[key]))
                i += 1
i                

77: [(42.386844, -71.09812)] != [(42.386844, -71.09812), (42.396386809455, -71.12226963043213)]
212: [(42.3688385713652, -71.03976666949166), (42.36884408289835, -71.03977829217911)] != [(42.36884408289835, -71.03977829217911)]
135: [(42.344827, -71.028664)] != [(42.344827, -71.028664), (42.353334, -71.09850591982195)]
207: [(42.35480179069945, -71.15025043487549), (42.35484, -71.150226)] != [(42.35484, -71.150226)]
153: [(45.505086, -73.566921)] != [(45.505086, -73.566921), (42.37530443828819, -71.08688293667001)]
201: [(42.316902, -71.091946)] != [(42.316873, -71.091955), (42.316902, -71.091946)]
205: [(42.306348, -71.067149), (42.30785224238503, -71.06512248516083)] != [(42.30791912678712, -71.06502056121826), (42.306348, -71.067149), (42.30785224238503, -71.06512248516083)]
219: [(42.374335, -71.039685), (42.37430732260765, -71.03807777166365), (42.373312125824704, -71.0410200806291)] != [(42.37430131617823, -71.03808671246952), (42.374335, -71.039685), (42.37430732260765, -71.0380

10

## Map all stations

In [None]:
map_stations_all_cords(start_dict)

In [None]:
map_stations_all_cords(end_dict)

In [None]:
map_stations_all_cords(all_station)

Outlier on the map:

379: (-90.0,0.0), 158 (0.0, 0.0), 153 (45.505086, -73.566921), 308 (42.16722555541654, -70.90555783370291)

All stations with multiple positions

In [None]:
map_stations_with_multiple_coordinates(all_station)

New outlier found: 

229 (0.0, 0.0)

Search for all stations with coordinates in the outlier range

In [None]:
outlier_cords = [(0.0, 0.0), (-90.0,0.0), (45.505086, -73.566921), (42.16722555541654, -70.90555783370291)]

In [None]:
for key, value in start_dict.items():
    for point in value:
        if point in outlier_cords:
            print(str(key) + ": " + str(point))

153: (45.505086, -73.566921)
229: (0.0, 0.0)
230: (0.0, 0.0)
164: (0.0, 0.0)
158: (0.0, 0.0)
308: (42.16722555541654, -70.90555783370291)


Outliers found: 


153: (45.505086, -73.566921),
229: (0.0, 0.0),
230: (0.0, 0.0),
164: (0.0, 0.0),
158: (0.0, 0.0),
308: (42.16722555541654, -70.90555783370291)

In [None]:
far_away_map(all_station)

New outlier candidates: 135 (42.353334, -71.09850591982195), 224, 77

## Further data cleaning

In [None]:
outlier_cords

[(0.0, 0.0),
 (-90.0, 0.0),
 (45.505086, -73.566921),
 (42.16722555541654, -70.90555783370291)]

Remove clear outliers

In [None]:
bluebikes_clean = bluebikes.copy()
for point in outlier_cords:
    bluebikes_clean = bluebikes_clean[~((bluebikes_clean.start_station_lat == point[0]) & (bluebikes_clean.start_station_lon == point[1]))]
    bluebikes_clean = bluebikes_clean[~((bluebikes_clean.end_station_lat == point[0]) & (bluebikes_clean.end_station_lon == point[1]))]

In [None]:
start_dict, end_dict, all_station = get_station_dict(bluebikes_clean)

In [None]:
all_station.keys().__len__()

313

In [None]:
all_station[135]

[(42.344827, -71.028664)]

In [None]:
map_stations_all_cords(all_station)

In [None]:
map_stations_with_multiple_coordinates(all_station)

In [None]:
far_away_map(all_station, 0.25)

No more big outliers

In [None]:
far_away_map(all_station, 0.15)

Still some small differences left.

To fix this we will use the average of the coordinates of the stations with the same id.

In [None]:
all_station_average = {}
for key, value in all_station.items():
    lat = 0
    lon = 0
    for point in value:
        lat += point[0]
        lon += point[1]
    lat /= len(value)
    lon /= len(value)
    all_station_average[key] = ((lat, lon))

In [None]:
stations = pd.DataFrame.from_dict(all_station_average, orient='index', columns=['lat', 'lon'])

In [None]:
stations = pd.DataFrame({'station_id': list(all_station_average.keys()), 'lat': [x[0] for x in list(all_station_average.values())], 'lon': [x[1] for x in list(all_station_average.values())]})

In [None]:
stations.head(5)

Unnamed: 0,station_id,lat,lon
0,67,42.3581,-71.093198
1,139,42.36178,-71.1081
2,36,42.349792,-71.077338
3,10,42.350406,-71.108279
4,9,42.351187,-71.115852


In [None]:
#map stations with folium
bluebikes_map_points = folium.Map(location=(42.361145, -71.057083), tiles="OpenStreetMap", zoom_start=12)

for index, row in stations.iterrows():
    folium.Marker(location=[row['lat'], row['lon']], popup=row['station_id']).add_to(bluebikes_map_points)
bluebikes_map_points    

In [271]:
stations.to_csv(r"C:\Users\STH_0\Desktop\Data\bluebikes_stations.csv", index=False)