In [1]:
import requests
import matplotlib
import geopandas as gpd

In [2]:
# Trip-data bucket and the monthly archive this notebook analyses.
BASE_URL = "https://s3.amazonaws.com/tripdata/"
month = "202412-citibike-tripdata.zip"

# Fail fast on a stalled connection or a non-2xx reply. Without the status
# check, an error body (e.g. a 403/404 page) would be stored in `latest_data`
# as if it were the zip archive and only surface later as an unrelated error.
latest_data = requests.get(BASE_URL + month, timeout=60)
latest_data.raise_for_status()

In [37]:
from decimal import Decimal
from io import BytesIO
import pandas as pd
from zipfile import ZipFile

# Read the first CSV member out of the zipped HTTP response body.
# NOTE: only the first matching member is loaded; any further CSV files in the
# archive are ignored.

zip_data = BytesIO(latest_data.content)

with ZipFile(zip_data) as zip_file:
    # Archives built on macOS carry `__MACOSX/._<name>.csv` resource-fork
    # entries that end in ".csv" but are not CSV data, so skip that folder.
    csv_filename = next(
        (
            name
            for name in zip_file.namelist()
            if name.endswith(".csv") and not name.startswith("__MACOSX/")
        ),
        None,
    )

    # Stop here with a clear message instead of leaving an empty frame that
    # would only fail later with a missing-column error.
    if csv_filename is None:
        raise ValueError(
            f"No .csv member found in the downloaded archive; members: {zip_file.namelist()}"
        )

    with zip_file.open(csv_filename) as csv_file:
        origin_table_do_not_modify = pd.read_csv(
            csv_file,
            # Station IDs are read as text: values such as "5788.13" and
            # "Shop Morgan" are labels, not numbers.
            dtype={"start_station_id": str, "end_station_id": str},
            # Parse latitude as Decimal rather than float
            # (presumably to compare the reported digits exactly).
            converters={"start_lat": Decimal},
        )


### Coordinate Inconsistencies

A small fraction of stations have more than one `start_lat` value for a given `start_station_id`. In this sample, it is only 7 stations, which is 0.33% of stations.

In [34]:
# Keep only the trips for which both the start and the end station ID are present.
non_null_station_ids = origin_table_do_not_modify.dropna(
    subset=["start_station_id", "end_station_id"],
    how="any",
)

In [16]:
# Count the distinct `start_lat` values seen for each `start_station_id`, largest count first.
start_lat_by_station = non_null_station_ids.groupby('start_station_id')['start_lat']
unique_start_lat_counts = start_lat_by_station.nunique().sort_values(ascending=False)

print("How many stations have more than one unique `start_lat` value? And how many values do they have?")
print(unique_start_lat_counts)
print("\n\n\n")

# Share of stations reporting more than one latitude.
# In the recorded run this was 7 stations, i.e. 0.33% of stations.
has_multiple_lats = unique_start_lat_counts > 1
n_variable_stations = len(unique_start_lat_counts[has_multiple_lats])
n_stations = len(unique_start_lat_counts)

print("What percentage of stations have more than one unique `start_lat` value?")
print(100 * n_variable_stations / n_stations)


How many stations have more than one unique `start_lat` value? And how many values do they have?
start_station_id
5788.13         5
6569.09         4
5230.02         2
4001.09         2
5204.05         2
               ..
5225.04         1
5225.02         1
5219.05         1
5217.03         1
Shop Morgan     1
Name: start_lat, Length: 2138, dtype: int64




What percentage of stations have more than one unique `start_lat` value?
0.3274087932647334


In [27]:

# Narrow the per-station counts down to the stations that report more than one
# distinct `start_lat`, then show how the trips split across those values.
# In the recorded run, station 5788.13 had 5 values: four appeared on a single
# trip each, and the remaining 2965 trips shared one latitude.
stations_w_multiple_start_lats = unique_start_lat_counts[unique_start_lat_counts > 1]

print("What is the breakdown of these variable start_lat values?")
for station_id, n_unique_lats in stations_w_multiple_start_lats.items():
    print(f"Station {station_id} has {n_unique_lats} unique `start_lat` values")
    station_trips = non_null_station_ids[non_null_station_ids.start_station_id == station_id]
    print(station_trips.start_lat.value_counts())
    print("\n\n")

What is the breakdown of these variable start_lat values?
Station 5788.13 has 5 unique `start_lat` values
start_lat
40.73020660529954    2965
40.730251551            1
40.730248094            1
40.730335832            1
40.730383039            1
Name: count, dtype: int64



Station 6569.09 has 4 unique `start_lat` values
start_lat
40.75414519263519    681
40.754180074           1
40.754176259           1
40.754261613           1
Name: count, dtype: int64



Station 5230.02 has 2 unique `start_lat` values
start_lat
40.71332151208186    649
40.713014007           1
Name: count, dtype: int64



Station 4001.09 has 2 unique `start_lat` values
start_lat
40.6737236      596
40.673810124      1
Name: count, dtype: int64



Station 5204.05 has 2 unique `start_lat` values
start_lat
40.71285887     1654
40.712882996       1
Name: count, dtype: int64



Station 6611.07 has 2 unique `start_lat` values
start_lat
40.75676475086503    863
40.757083774           1
Name: count, dtype: int64



Station 

In [40]:
# Show the latitude column and its dtype.
start_lat_column = non_null_station_ids["start_lat"]
print(start_lat_column)
start_lat_column.dtype

0                 40.823061
3         40.73153937464073
4               40.76030096
5         40.73153937464073
6             40.6896218879
                ...        
999995           40.7459849
999996             40.72028
999997           40.8025566
999998          40.72405549
999999            40.843079
Name: start_lat, Length: 995161, dtype: object


dtype('O')

### Do these inconsistent values appear elsewhere?

I wanted to find out whether these stray latitude values are being assigned from another dock. That doesn't appear to be the case: searching the `start_lat` column for one of the other values only brings up the result from the original dock.

In [45]:

# One of the stray latitudes reported for station 6569.09.
input_decimal = Decimal('40.754180074')


def _matches_input_decimal(value):
    """Return True when `value`, converted to Decimal, equals `input_decimal`."""
    return Decimal(value) == input_decimal


# Boolean mask over every trip (rows without station IDs included).
is_equal = origin_table_do_not_modify['start_lat'].apply(_matches_input_decimal)

print(is_equal)

# Rows whose `start_lat` is exactly the searched value.
df = origin_table_do_not_modify.loc[is_equal]
df

0         False
1         False
2         False
3         False
4         False
          ...  
999995    False
999996    False
999997    False
999998    False
999999    False
Name: start_lat, Length: 1000000, dtype: bool


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
387588,7EC976A47331555A,electric_bike,2024-12-11 17:12:54.480,2024-12-11 17:22:06.101,W 35 St & 9 Ave,6569.09,Lexington Ave & E 26 St,6089.08,40.754180074,-73.996227,40.741459,-73.983293,member


### Do dock names stay consistent?
Yes, at least within a single file.

In [51]:
# Number of distinct station names recorded under each start station ID, largest first.
# A count above 1 would mean a station ID appears under more than one name.
start_station_name_counts = (
    origin_table_do_not_modify
    .groupby('start_station_id')['start_station_name']
    .nunique()
)
start_station_name_counts.sort_values(ascending=False)

start_station_id
2733.03         1
6824.07         1
6854.05         1
6850.01         1
6848.04         1
               ..
5225.04         1
5225.02         1
5219.05         1
5217.03         1
Shop Morgan     1
Name: start_station_name, Length: 2138, dtype: int64

In [54]:
# Number of distinct start station IDs in the file (missing IDs are not counted).
start_station_ids = origin_table_do_not_modify['start_station_id']
start_station_ids.nunique()

2138