In [44]:
import pandas as pd 
import geopandas as gpd
df = pd.read_csv('Taxi_Trips__2024-__20250505.csv')

print(df.head())

                                    Trip ID   
0  0000184e7cd53cee95af32eba49c44e4d20adcd8  \
1  000072ee076c9038868e239ca54185eb43959db0   
2  000074019d598c2b1d6e77fbae79e40b0461a2fc   
3  00007572c5f92e2ff067e6f838a5ad74e83665d3   
4  00007c3e7546e2c7d15168586943a9c22c3856cf   

                                             Taxi ID    Trip Start Timestamp   
0  f538e6b729d1aaad4230e9dcd9dc2fd9a168826ddadbd6...  01/19/2024 05:00:00 PM  \
1  e51e2c30caec952b40b8329a68b498e18ce8a1f40fa75c...  01/28/2024 02:30:00 PM   
2  aeb280ef3be3e27e081eb6e76027615b0d40925b84d3eb...  01/05/2024 09:00:00 AM   
3  7d21c2ca227db8f27dda96612bfe5520ab408fa9a462c8...  01/22/2024 08:45:00 AM   
4  8ef1056519939d511d24008e394f83e925d2539d668a00...  01/18/2024 07:15:00 PM   

       Trip End Timestamp  Trip Seconds  Trip Miles  Pickup Census Tract   
0  01/19/2024 06:00:00 PM        4051.0       17.12         1.703198e+10  \
1  01/28/2024 03:00:00 PM        1749.0       12.70                  NaN   
2  01/05

In [45]:
#check for duplicate rows
print(df['Trip ID'].duplicated().sum())
#directly discard not needed columns
df = df.drop(columns=['Trip ID', 'Fare', 'Tips', 'Tolls', 'Extras', 'Trip Total', 'Payment Type', 'Company'])

0


In [46]:
print(df.isnull().sum())

Taxi ID                             3
Trip Start Timestamp                0
Trip End Timestamp                 66
Trip Seconds                     1541
Trip Miles                         69
Pickup Census Tract           4524944
Dropoff Census Tract          4636162
Pickup Community Area          226777
Dropoff Community Area         742953
Pickup Centroid Latitude       222468
Pickup Centroid Longitude      222468
Pickup Centroid Location       222468
Dropoff Centroid Latitude      697653
Dropoff Centroid Longitude     697653
Dropoff Centroid  Location     697653
dtype: int64


Creating geojson from census tract dataset and then enriching dataset with coordinates/census tracts

In [47]:
tracts  = gpd.read_file("tl_2024_17_tract.geojson").to_crs(4326)

tract_centroids = (tracts.set_index("GEOID").geometry.centroid)

# checking for rows with coordinates and without tracts, to infer tracts
pickup_missing_tract = (df["Pickup Census Tract"].isna()
                 & df["Pickup Centroid Latitude"].notna()
                 & df["Pickup Centroid Longitude"].notna()
)


dropoff_missing_tract = (df["Dropoff Census Tract"].isna()
                 & df["Dropoff Centroid Latitude"].notna()
                 & df["Dropoff Centroid Longitude"].notna()
)

if pickup_missing_tract.any():
    pickup_points = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(
            df.loc[pickup_missing_tract, "Pickup Centroid Longitude"],
            df.loc[pickup_missing_tract, "Pickup Centroid Latitude"],
            crs=4326
        ),
        index=df.index[pickup_missing_tract]
    )

if dropoff_missing_tract.any():
    dropoff_points = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(
            df.loc[dropoff_missing_tract, "Dropoff Centroid Longitude"],
            df.loc[dropoff_missing_tract, "Dropoff Centroid Latitude"],
            crs=4326
        ),
        index=df.index[dropoff_missing_tract]
    )


pickup_joined = gpd.sjoin(pickup_points, tracts[["GEOID", "geometry"]], how="left", predicate="within")
df.loc[pickup_joined.index, "Pickup Census Tract"] = pickup_joined["GEOID"].values

dropoff_joined = gpd.sjoin(dropoff_points, tracts[["GEOID", "geometry"]], how="left", predicate="within")
df.loc[dropoff_joined.index, "Dropoff Census Tract"] = dropoff_joined["GEOID"].values

#checking for any rows with tracts and without coordinates and setting coordinates to center point of tract

pickup_missing_coords = (
    df["Pickup Census Tract"].notna()
    & df["Pickup Centroid Latitude"].isna()
    & df["Pickup Centroid Longitude"].isna()
)

dropoff_missing_coords = (
    df["Dropoff Census Tract"].notna()
    & df["Dropoff Centroid Latitude"].isna()
    & df["Dropoff Centroid Longitude"].isna()
)

if pickup_missing_coords.any():
    cent = tract_centroids.reindex(df.loc[pickup_missing_coords, "Pickup Census Tract"].astype(str))
    df.loc[pickup_missing_coords, "Pickup Centroid Latitude"] = cent.y.values
    df.loc[pickup_missing_coords, "Pickup Centroid Longitude"] = cent.x.values

if dropoff_missing_coords.any():
    cent = tract_centroids.reindex(df.loc[dropoff_missing_coords, "Dropoff Census Tract"].astype(str))
    df.loc[dropoff_missing_coords, "Dropoff Centroid Latitude"] = cent.y.values
    df.loc[dropoff_missing_coords, "Dropoff Centroid Longitude"] = cent.x.values


print(df.isnull().sum())



  tract_centroids = (tracts.set_index("GEOID").geometry.centroid)


Taxi ID                            3
Trip Start Timestamp               0
Trip End Timestamp                66
Trip Seconds                    1541
Trip Miles                        69
Pickup Census Tract           414554
Dropoff Census Tract          915663
Pickup Community Area         226777
Dropoff Community Area        742953
Pickup Centroid Latitude      222468
Pickup Centroid Longitude     222468
Pickup Centroid Location      222468
Dropoff Centroid Latitude     697653
Dropoff Centroid Longitude    697653
Dropoff Centroid  Location    697653
dtype: int64


In [48]:
df = df.dropna(subset=['Taxi ID', 'Trip End Timestamp', 'Trip Seconds', 'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract', 'Pickup Community Area', 'Dropoff Community Area', 'Dropoff Census Tract', 'Pickup Centroid Latitude', 'Pickup Centroid Longitude', 'Pickup Centroid Location', 'Dropoff Centroid Latitude', 'Dropoff Centroid Longitude', 'Dropoff Centroid  Location'])
print(df.isnull().sum())

Taxi ID                       0
Trip Start Timestamp          0
Trip End Timestamp            0
Trip Seconds                  0
Trip Miles                    0
Pickup Census Tract           0
Dropoff Census Tract          0
Pickup Community Area         0
Dropoff Community Area        0
Pickup Centroid Latitude      0
Pickup Centroid Longitude     0
Pickup Centroid Location      0
Dropoff Centroid Latitude     0
Dropoff Centroid Longitude    0
Dropoff Centroid  Location    0
dtype: int64


In [49]:
print(df.dtypes)

Taxi ID                        object
Trip Start Timestamp           object
Trip End Timestamp             object
Trip Seconds                  float64
Trip Miles                    float64
Pickup Census Tract            object
Dropoff Census Tract           object
Pickup Community Area         float64
Dropoff Community Area        float64
Pickup Centroid Latitude      float64
Pickup Centroid Longitude     float64
Pickup Centroid Location       object
Dropoff Centroid Latitude     float64
Dropoff Centroid Longitude    float64
Dropoff Centroid  Location     object
dtype: object


In [50]:
# Trip Seconds
df_1 = df['Trip Seconds'].dropna()
print("Trip Seconds:", (df_1 % 1 == 0).all())

# Trip Miles
df_2 = df['Trip Miles'].dropna()
print("Trip Miles:", (df_2 % 1 == 0).all())

# Pickup Community Area
df_5 = df['Pickup Community Area'].dropna()
print("Pickup Community Area:", (df_5 % 1 == 0).all())

# Dropoff Community Area
df_6 = df['Dropoff Community Area'].dropna()
print("Dropoff Community Area:", (df_6 % 1 == 0).all())

# Pickup Centroid Latitude
df_7 = df['Pickup Centroid Latitude'].dropna()
print("Pickup Centroid Latitude:", (df_7 % 1 == 0).all())

# Pickup Centroid Longitude
df_8 = df['Pickup Centroid Longitude'].dropna()
print("Pickup Centroid Longitude:", (df_8 % 1 == 0).all())

# Dropoff Centroid Latitude
df_9 = df['Dropoff Centroid Latitude'].dropna()
print("Dropoff Centroid Latitude:", (df_9 % 1 == 0).all())

# Dropoff Centroid Longitude
df_10 = df['Dropoff Centroid Longitude'].dropna()
print("Dropoff Centroid Longitude:", (df_10 % 1 == 0).all())

Trip Seconds: True
Trip Miles: False
Pickup Community Area: True
Dropoff Community Area: True
Pickup Centroid Latitude: False
Pickup Centroid Longitude: False
Dropoff Centroid Latitude: False
Dropoff Centroid Longitude: False


In [51]:
df['Trip Seconds'] = df['Trip Seconds'].fillna(-1).astype('int32')
df['Pickup Community Area'] = df['Pickup Community Area'].fillna(0).astype('int64')
df['Dropoff Community Area'] = df['Dropoff Community Area'].fillna(0).astype('int64')
print(df.dtypes)

Taxi ID                        object
Trip Start Timestamp           object
Trip End Timestamp             object
Trip Seconds                    int32
Trip Miles                    float64
Pickup Census Tract            object
Dropoff Census Tract           object
Pickup Community Area           int64
Dropoff Community Area          int64
Pickup Centroid Latitude      float64
Pickup Centroid Longitude     float64
Pickup Centroid Location       object
Dropoff Centroid Latitude     float64
Dropoff Centroid Longitude    float64
Dropoff Centroid  Location     object
dtype: object


In [52]:
df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'], errors='coerce')
df['Trip End Timestamp'] = pd.to_datetime(df['Trip End Timestamp'], errors='coerce')

#because Times are being rounded, checking for rows with same start and end time
same_time_count = df[df['Trip Start Timestamp'] == df['Trip End Timestamp']]
print(same_time_count)

  df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'], errors='coerce')
  df['Trip End Timestamp'] = pd.to_datetime(df['Trip End Timestamp'], errors='coerce')


                                                   Taxi ID   
2        aeb280ef3be3e27e081eb6e76027615b0d40925b84d3eb...  \
5        833e49f9757b594a8a6765b93d1f7d8ad483e61c3a89d1...   
12       8307cf9433f0293eee99c6944aeab484521d9cd9b1fce5...   
18       c531f081cad817a366cbae2254ce7a3bb370394b3b0f13...   
37       2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba...   
...                                                    ...   
7917829  d511072131b602026bdb9faa5491d15c3af8d62dc00659...   
7917832  82a44a86e90ff3ed8e6e8b06bce2c86826f6a6500c2cfe...   
7917833  9475d35df519612aec86dbbfcb872633159f538fe7d90a...   
7917836  e2c35dff8bc754fc2b8e2b41774af45912aedd1d47bfa4...   
7917842  8cba51d644d2a96d874d93a5b0eb9cf26512383c2fdc5d...   

        Trip Start Timestamp  Trip End Timestamp  Trip Seconds  Trip Miles   
2        2024-01-05 09:00:00 2024-01-05 09:00:00           517        3.39  \
5        2024-01-09 17:00:00 2024-01-09 17:00:00            12        0.18   
12       2024-01-27 1

In [54]:
#checking for trips where the trip length is 0 and the trip distance is 0
print(len(df))
df = df[~((df["Trip Seconds"] == 0) & (df["Trip Miles"] == 0))]
print(len(df))


6730341
6647737


In [55]:
print(df['Pickup Census Tract'].nunique())

572


In [56]:
df.to_csv('testing_no_null.csv', index=False)

In [57]:
#import geopandas as gpd



#tracts = gpd.read_file('tl_2024_17_tract/tl_2024_17_tract.shp')

#print(tracts)



#all_tracts = pd.unique(df[['Pickup Census Tract', 'Dropoff Census Tract']].values.ravel())

#filtered_tracts = tracts[tracts['GEOID'].isin(all_tracts)]
#print(filtered_tracts.shape)
#filtered_tracts.to_file('chicago_census_tracts.geojson', driver='GEOJSon')





