# Districts

## Imports

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon

## Load data

In [2]:
districtsData = '..\\data\\city_council_districts\\City_Council_Districts_Shapefile_-_Effective_2023_20241004.csv'
districts_df = pd.read_csv(districtsData)
districts_df.head()

Unnamed: 0,the_geom,DIST_ID,DISTRICTOR,DISTRICT_2,INDISTRICT,INDISTRI_1,ATLARGE,ATLARGETIT
0,MULTIPOLYGON (((-94.67532920837402 39.35500144...,1,First,1st,Heather Hall,Councilwoman,Kevin O'Neill,Councilman
1,MULTIPOLYGON (((-94.53610400019943 39.11611899...,3,Third,3rd,Melissa Robinson,Councilwoman,Brandon Ellington,Councilman
2,MULTIPOLYGON (((-94.59837341308588 39.04980659...,6,Sixth,6th,Kevin McManus,Councilman,Andrea Bough,Councilwoman
3,MULTIPOLYGON (((-94.57831573486322 39.19568061...,4,Fourth,4th,Eric Bunch,Councilman,Katheryn Shields,Councilwoman
4,MULTIPOLYGON (((-94.56217500007614 39.04358200...,5,Fifth,5th,Ryana Parks-Shaw,Councilwoman,"Lee Barnes, Jr.",Councilman


In [3]:
mergedData = '..\\data\\mergedData\\merged_df.csv'
df = pd.read_csv(mergedData)

  df = pd.read_csv(mergedData)


In [4]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Report_No,Reported_Date,Reported_Time,From_Date,Address,City,DVFlag,Involvement,Firearm Used Flag,Location,Offense_Description
0,0,0,150015427,03/06/2015,0:02,03/06/2015,BROADWAY and WESTPORT RD,KANSAS CITY,U,VIC,N,POINT (-94.5767 38.9767),Misc Violation
1,1,3,150057463,09/08/2015,1:17,09/08/2015,PROSPECT AV and E TRUMAN RD,KANSAS CITY,U,SUS,N,POINT (-94.5516 39.0947),Auto Theft
2,2,4,150033873,05/19/2015,0:21,05/19/2015,VICTOR ST and WALROND AV,KANSAS CITY,U,VIC,N,POINT (-94.5461 39.0735),Possession/Sale/Dist
3,3,5,150061779,08/31/2015,10:28,08/31/2015,PASEO and E TRUMAN RD,KANSAS CITY,N,SUS,N,POINT (-89.0251 42.2167),Non Aggravated Assau
4,4,6,150087307,12/04/2015,2:57,12/04/2015,PASEO and E TRUMAN RD,KANSAS CITY,U,VIC,N,POINT (-94.5645 39.0952),Misc Violation


In [5]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987560 entries, 0 to 987559
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Report_No            987560 non-null  object
 1   Reported_Date        987560 non-null  object
 2   Reported_Time        987560 non-null  object
 3   From_Date            987560 non-null  object
 4   Address              987560 non-null  object
 5   City                 987560 non-null  object
 6   DVFlag               987560 non-null  object
 7   Involvement          987560 non-null  object
 8   Firearm Used Flag    987560 non-null  object
 9   Location             987560 non-null  object
 10  Offense_Description  987560 non-null  object
dtypes: object(11)
memory usage: 82.9+ MB


## Matching location to districts

In [7]:
# 1. Create geometry from polygon data (district boundaries)
districts_gdf = gpd.GeoDataFrame(districts_df, geometry=gpd.GeoSeries.from_wkt(districts_df["the_geom"]))

# Ensure the district GeoDataFrame has the correct CRS (assuming EPSG:4326, which is WGS 84)
districts_gdf.set_crs(epsg=4326, inplace=True)

# 2. Extract longitude and latitude from 'Location' ('POINT (Lon Lat)' format)
df['lon'] = df['Location'].apply(lambda x: float(x.split('(')[1].split()[0]))  # Extract longitude
df['lat'] = df['Location'].apply(lambda x: float(x.split('(')[1].split()[1].replace(')', '')))  # Extract latitude

# 3. Create a GeoDataFrame for the locations (using Longitude, Latitude)
locations_gdf = gpd.GeoDataFrame(df, 
                                 geometry=gpd.points_from_xy(df.lon, df.lat))

# Ensure the locations GeoDataFrame has the same CRS (assuming EPSG:4326)
locations_gdf.set_crs(epsg=4326, inplace=True)

# 4. Perform a spatial join to match crime locations with the district polygons
locations_gdf = gpd.sjoin(locations_gdf, districts_gdf, how='left', predicate='within')

# 5. Add 'dist_id' (district identifier) to the locations GeoDataFrame
locations_gdf['dist_id'] = locations_gdf['DIST_ID']

# Print df infos
print(locations_gdf.info())

# 6. Find missing dist_id rows (those that don't fall within any district)
missing_districts_gdf = locations_gdf[locations_gdf['dist_id'].isna()].copy()
print(len(missing_districts_gdf), "Could not be added to a specific district")

# 7. Reproject both GeoDataFrames to a projected CRS (for accurate distance calculations)
if not missing_districts_gdf.empty:
    print("Trying to match locations to nearest district..")
    # Reproject to a projected CRS, for example, UTM Zone 14N (EPSG:26914) -> EPSG:26914 (UTM Zone 14N for Kansas City)
    districts_projected = districts_gdf.to_crs(epsg=26914)
    missing_districts_projected = missing_districts_gdf.to_crs(epsg=26914)

    # Compute the nearest district for each missing location (using projected geometries)
    missing_districts_gdf.loc[:, 'nearest_geom'] = missing_districts_projected.geometry.apply(
        lambda x: districts_projected.distance(x).idxmin()
    )

    # Map the 'DIST_ID' of the nearest district to the missing rows
    missing_districts_gdf.loc[:, 'dist_id'] = missing_districts_gdf['nearest_geom'].apply(
        lambda idx: districts_gdf.loc[idx, 'DIST_ID']
    )

    # Replace the missing district rows in the original locations_gdf
    locations_gdf.loc[locations_gdf['dist_id'].isna(), 'dist_id'] = missing_districts_gdf['dist_id'].values

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 987906 entries, 0 to 987559
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   Report_No            987906 non-null  object  
 1   Reported_Date        987906 non-null  object  
 2   Reported_Time        987906 non-null  object  
 3   From_Date            987906 non-null  object  
 4   Address              987906 non-null  object  
 5   City                 987906 non-null  object  
 6   DVFlag               987906 non-null  object  
 7   Involvement          987906 non-null  object  
 8   Firearm Used Flag    987906 non-null  object  
 9   Location             987906 non-null  object  
 10  Offense_Description  987906 non-null  object  
 11  lon                  987906 non-null  float64 
 12  lat                  987906 non-null  float64 
 13  geometry             987906 non-null  geometry
 14  index_right          960477 non-null  float64 
 1

In [8]:
locations_gdf.head()

Unnamed: 0,Report_No,Reported_Date,Reported_Time,From_Date,Address,City,DVFlag,Involvement,Firearm Used Flag,Location,...,index_right,the_geom,DIST_ID,DISTRICTOR,DISTRICT_2,INDISTRICT,INDISTRI_1,ATLARGE,ATLARGETIT,dist_id
0,150015427,03/06/2015,0:02,03/06/2015,BROADWAY and WESTPORT RD,KANSAS CITY,U,VIC,N,POINT (-94.5767 38.9767),...,2.0,MULTIPOLYGON (((-94.59837341308588 39.04980659...,6.0,Sixth,6th,Kevin McManus,Councilman,Andrea Bough,Councilwoman,6.0
1,150057463,09/08/2015,1:17,09/08/2015,PROSPECT AV and E TRUMAN RD,KANSAS CITY,U,SUS,N,POINT (-94.5516 39.0947),...,1.0,MULTIPOLYGON (((-94.53610400019943 39.11611899...,3.0,Third,3rd,Melissa Robinson,Councilwoman,Brandon Ellington,Councilman,3.0
2,150033873,05/19/2015,0:21,05/19/2015,VICTOR ST and WALROND AV,KANSAS CITY,U,VIC,N,POINT (-94.5461 39.0735),...,1.0,MULTIPOLYGON (((-94.53610400019943 39.11611899...,3.0,Third,3rd,Melissa Robinson,Councilwoman,Brandon Ellington,Councilman,3.0
3,150061779,08/31/2015,10:28,08/31/2015,PASEO and E TRUMAN RD,KANSAS CITY,N,SUS,N,POINT (-89.0251 42.2167),...,,,,,,,,,,1.0
4,150087307,12/04/2015,2:57,12/04/2015,PASEO and E TRUMAN RD,KANSAS CITY,U,VIC,N,POINT (-94.5645 39.0952),...,1.0,MULTIPOLYGON (((-94.53610400019943 39.11611899...,3.0,Third,3rd,Melissa Robinson,Councilwoman,Brandon Ellington,Councilman,3.0


In [9]:
locations_gdf.columns

Index(['Report_No', 'Reported_Date', 'Reported_Time', 'From_Date', 'Address',
       'City', 'DVFlag', 'Involvement', 'Firearm Used Flag', 'Location',
       'Offense_Description', 'lon', 'lat', 'geometry', 'index_right',
       'the_geom', 'DIST_ID', 'DISTRICTOR', 'DISTRICT_2', 'INDISTRICT',
       'INDISTRI_1', 'ATLARGE', 'ATLARGETIT', 'dist_id'],
      dtype='object')

In [10]:
df = locations_gdf.drop(columns=['geometry', 'index_right', 'the_geom', 'DIST_ID', 'DISTRICTOR',
       'DISTRICT_2', 'INDISTRICT', 'INDISTRI_1', 'ATLARGE', 'ATLARGETIT'])

In [11]:
df.head()

Unnamed: 0,Report_No,Reported_Date,Reported_Time,From_Date,Address,City,DVFlag,Involvement,Firearm Used Flag,Location,Offense_Description,lon,lat,dist_id
0,150015427,03/06/2015,0:02,03/06/2015,BROADWAY and WESTPORT RD,KANSAS CITY,U,VIC,N,POINT (-94.5767 38.9767),Misc Violation,-94.5767,38.9767,6.0
1,150057463,09/08/2015,1:17,09/08/2015,PROSPECT AV and E TRUMAN RD,KANSAS CITY,U,SUS,N,POINT (-94.5516 39.0947),Auto Theft,-94.5516,39.0947,3.0
2,150033873,05/19/2015,0:21,05/19/2015,VICTOR ST and WALROND AV,KANSAS CITY,U,VIC,N,POINT (-94.5461 39.0735),Possession/Sale/Dist,-94.5461,39.0735,3.0
3,150061779,08/31/2015,10:28,08/31/2015,PASEO and E TRUMAN RD,KANSAS CITY,N,SUS,N,POINT (-89.0251 42.2167),Non Aggravated Assau,-89.0251,42.2167,1.0
4,150087307,12/04/2015,2:57,12/04/2015,PASEO and E TRUMAN RD,KANSAS CITY,U,VIC,N,POINT (-94.5645 39.0952),Misc Violation,-94.5645,39.0952,3.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 987906 entries, 0 to 987559
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Report_No            987906 non-null  object 
 1   Reported_Date        987906 non-null  object 
 2   Reported_Time        987906 non-null  object 
 3   From_Date            987906 non-null  object 
 4   Address              987906 non-null  object 
 5   City                 987906 non-null  object 
 6   DVFlag               987906 non-null  object 
 7   Involvement          987906 non-null  object 
 8   Firearm Used Flag    987906 non-null  object 
 9   Location             987906 non-null  object 
 10  Offense_Description  987906 non-null  object 
 11  lon                  987906 non-null  float64
 12  lat                  987906 non-null  float64
 13  dist_id              987906 non-null  float64
dtypes: float64(3), object(11)
memory usage: 113.1+ MB


## Export

In [13]:
df.to_csv("..\\data\\mergedData\\merged_df.csv")

---