In [76]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
df = pd.read_csv("chicago crime - 2001~current.csv", encoding='utf-8', low_memory=False)

In [77]:
df.isna().sum()

ID                           0
Case Number                  0
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description     14099
Arrest                       0
Domestic                     0
Beat                         0
District                    47
Ward                    614826
Community Area          613453
FBI Code                     0
X Coordinate             91682
Y Coordinate             91682
Year                         0
Updated On                   0
Latitude                 91682
Longitude                91682
Location                 91682
dtype: int64

In [78]:
df = df.dropna(subset=['X Coordinate','Y Coordinate','Latitude', 'Longitude', 'Location'])

In [11]:
df.isna().sum()

ID                           0
Case Number                  0
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description      9097
Arrest                       0
Domestic                     0
Beat                         0
District                    47
Ward                    605573
Community Area          604251
FBI Code                     0
X Coordinate                 0
Y Coordinate                 0
Year                         0
Updated On                   0
Latitude                     0
Longitude                    0
Location                     0
dtype: int64

In [7]:
df['Date']

3          09/06/2023 05:00:00 PM
4          09/06/2023 11:00:00 AM
5          05/21/2019 08:20:00 AM
6          07/07/2021 10:30:00 AM
7          06/14/2022 02:47:00 PM
                    ...          
8280637    03/16/2025 12:36:00 PM
8280638    03/16/2025 11:00:00 PM
8280639    03/16/2025 08:44:00 PM
8280640    03/16/2025 03:45:00 PM
8280641    03/16/2025 09:44:00 AM
Name: Date, Length: 8188960, dtype: object

In [12]:
df['Date'] = pd.to_datetime(df['Date'], format="%m/%d/%Y %I:%M:%S %p", errors='coerce')

In [13]:
# 시기 기준으로 나누기
df_2001_2015 = df[df['Date'].dt.year <= 2015]
df_2016_2022 = df[(df['Date'].dt.year > 2015) & (df['Date'].dt.year < 2023)]
df_2023 = df[df['Date'].dt.year >= 2023]

# 파일로 저장 
df_2001_2015.to_csv("crime_2001_2015.csv", index=False)
df_2016_2022.to_csv("crime_2016_2022.csv", index=False)
df_2023.to_csv("crime_2023.csv", index=False)

In [14]:
def load_ward_gdf(csv_path, ward_col='WARD'):
    df = pd.read_csv(csv_path)
    df = df[df[ward_col].notna()]              # NA 제거
    df = df[df[ward_col] != 'OUT']             # 'OUT' 제거
    df[ward_col] = df[ward_col].astype(float)  # float형 변환
    df['geometry'] = df['the_geom'].apply(wkt.loads)
    return gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

In [15]:
df = pd.read_csv("crime_2001_2015.csv")
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])
crime_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
ward1 = load_ward_gdf("Boundaries - Wards (2003-2015).csv")
joined = gpd.sjoin(crime_gdf, ward_gdf[['WARD', 'geometry']], how='left', predicate='within')
joined['Ward'] = joined['Ward'].fillna(joined['WARD'])  
joined.drop(columns=['WARD', 'index_right'], inplace=True)
joined.to_csv("crime_2001_2015_filled.csv", index=False)

In [16]:
df2 = pd.read_csv("crime_2016_2022.csv")
df2['geometry'] = gpd.points_from_xy(df2['Longitude'], df2['Latitude'])
gdf2 = gpd.GeoDataFrame(df2, geometry='geometry', crs='EPSG:4326')
ward2 = load_ward_gdf("Boundaries - Wards (2015-2023).csv")
joined2 = gpd.sjoin(gdf2, ward2[['WARD', 'geometry']], how='left', predicate='within')
joined2['Ward'] = joined2['Ward'].fillna(joined2['WARD'])
joined2.drop(columns=['WARD', 'index_right'], inplace=True)
joined2.to_csv("crime_2016_2022_filled.csv", index=False)

In [17]:
df3 = pd.read_csv("crime_2023.csv")
df3['geometry'] = gpd.points_from_xy(df3['Longitude'], df3['Latitude'])
gdf3 = gpd.GeoDataFrame(df3, geometry='geometry', crs='EPSG:4326')
ward3 = load_ward_gdf("Boundaries - Wards (2023-).csv", ward_col='ward_id')  
joined3 = gpd.sjoin(gdf3, ward3[['ward_id', 'geometry']], how='left', predicate='within')
joined3['Ward'] = joined3['Ward'].fillna(joined3['ward_id'])
joined3.drop(columns=['ward_id', 'index_right'], inplace=True)
joined3.to_csv("crime_2023_filled.csv", index=False)

In [18]:
# 1. 시기별 파일 불러오기
df1 = pd.read_csv("crime_2001_2015_filled.csv")
df2 = pd.read_csv("crime_2016_2022_filled.csv")
df3 = pd.read_csv("crime_2023_filled.csv")
# 2. 하나로 합치기
df_all = pd.concat([df1, df2, df3], ignore_index=True)
# 3. 저장
df_all.to_csv("crime_all_filled.csv", index=False)

In [39]:
# 최종 병합된 파일 로드
df = pd.read_csv("crime_all_filled.csv")
# 10개 샘플만 보기
print(df[['Date', 'Primary Type', 'Latitude', 'Longitude', 'Ward']].sample(10))

                        Date         Primary Type   Latitude  Longitude  Ward
2225729  2005-10-21 12:20:00            NARCOTICS  41.778563 -87.684844  15.0
5912555  2015-09-10 14:20:00                THEFT  41.843474 -87.724529  22.0
6156869  2016-10-04 11:30:00   DECEPTIVE PRACTICE  41.767372 -87.690653  18.0
540481   2001-12-20 19:50:00              ASSAULT  41.728817 -87.569583   7.0
622369   2002-03-07 13:20:00                THEFT  41.856044 -87.662268  25.0
4176776  2010-09-02 15:15:00  MOTOR VEHICLE THEFT  41.796278 -87.702253  14.0
997165   2003-02-11 05:00:00             BURGLARY  41.886906 -87.716071  28.0
2274281  2005-12-17 15:00:00             BURGLARY  41.816090 -87.616779   3.0
474499   2001-11-27 18:00:00                THEFT  41.936415 -87.756757  31.0
5446151  2014-05-09 01:00:00              ROBBERY  41.943948 -87.649387  44.0


In [21]:
print(df.columns.tolist())

['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location', 'geometry']


In [28]:
df.isna().sum()

ID                           0
Case Number                  0
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description      9097
Arrest                       0
Domestic                     0
Beat                         0
District                    47
Ward                       616
Community Area          604339
FBI Code                     0
X Coordinate                 0
Y Coordinate                 0
Year                         0
Updated On                   0
Latitude                     0
Longitude                    0
Location                     0
geometry                     0
dtype: int64

In [29]:
df.shape

(8190154, 23)

In [23]:
print(df['Community Area'].unique())

[49. 53. 11. 71. 23.  2. 43. 65. 46. 44. 42.  1. 68. 19.  3. 58. 66. 27.
 74. 61. 75. 69. 21. 56. 38. 54. 51. 48. 24. 52. 45. nan 29. 32. 26. 30.
 67. 70. 57. 50. 12. 35. 41. 55.  8. 22. 25. 28. 40. 72. 73.  6. 36. 47.
 39. 77.  7. 15. 34. 16. 14. 31. 63.  9. 62. 17. 59. 10. 20. 60. 64. 76.
 13.  5.  4. 18. 33. 37.  0.]


In [40]:
(df['Community Area'] == 0).sum()

69

In [41]:
df['Community Area'] = df['Community Area'].replace(0, pd.NA)

In [42]:
area_df = pd.read_csv("Boundaries - Community Areas (current).csv")
area_df = area_df[area_df['AREA_NUMBE'].notna()]
area_df['AREA_NUMBE'] = area_df['AREA_NUMBE'].astype(float)
area_df['geometry'] = area_df['the_geom'].apply(wkt.loads)
area_gdf = gpd.GeoDataFrame(area_df, geometry='geometry', crs='EPSG:4326')
area_gdf = area_gdf[['AREA_NUMBE', 'geometry']].rename(columns={'AREA_NUMBE': 'Area_Num'})
df = df.reset_index()
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])
crime_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
joined = gpd.sjoin(crime_gdf, area_gdf, how='left', predicate='within')
joined['Community Area'] = joined['Community Area'].fillna(joined['Area_Num']).astype(float)
joined.drop(columns=['Area_Num', 'index_right'], inplace=True)
joined.to_csv("crime_final2.csv", index=False)

  joined['Community Area'] = joined['Community Area'].fillna(joined['Area_Num']).astype(float)


In [43]:
df = pd.read_csv("crime_final2.csv")

# Community Area 결측치 확인
print("결측치 수:", df['Community Area'].isna().sum())

# 어떤 값들이 있는지 유니크하게 확인
print("유니크 값:", sorted(df['Community Area'].dropna().unique()))

결측치 수: 497
유니크 값: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0]


In [44]:
df.isna().sum()

index                      0
ID                         0
Case Number                0
Date                       0
Block                      0
IUCR                       0
Primary Type               0
Description                0
Location Description    9097
Arrest                     0
Domestic                   0
Beat                       0
District                  47
Ward                     616
Community Area           497
FBI Code                   0
X Coordinate               0
Y Coordinate               0
Year                       0
Updated On                 0
Latitude                   0
Longitude                  0
Location                   0
geometry                   0
dtype: int64

In [45]:
df = df.dropna(subset=['Ward', 'Community Area', 'District'])

In [46]:
df.isna().sum()

index                      0
ID                         0
Case Number                0
Date                       0
Block                      0
IUCR                       0
Primary Type               0
Description                0
Location Description    9097
Arrest                     0
Domestic                   0
Beat                       0
District                   0
Ward                       0
Community Area             0
FBI Code                   0
X Coordinate               0
Y Coordinate               0
Year                       0
Updated On                 0
Latitude                   0
Longitude                  0
Location                   0
geometry                   0
dtype: int64

In [47]:
# 매핑 딕셔너리
community_area_map = {
    1: 'Rogers Park', 2: 'West Ridge', 3: 'Uptown', 4: 'Lincoln Square',
    5: 'North Center', 6: 'Lake View', 7: 'Lincoln Park', 8: 'Near North Side',
    9: 'Edison Park', 10: 'Norwood Park', 11: 'Jefferson Park', 12: 'Forest Glen',
    13: 'North Park', 14: 'Albany Park', 15: 'Portage Park', 16: 'Irving Park',
    17: 'Dunning', 18: 'Montclare', 19: 'Belmont Cragin', 20: 'Hermosa',
    21: 'Avondale', 22: 'Logan Square', 23: 'Humboldt Park', 24: 'West Town',
    25: 'Austin', 26: 'West Garfield Park', 27: 'East Garfield Park',
    28: 'Near West Side', 29: 'North Lawndale', 30: 'South Lawndale',
    31: 'Lower West Side', 32: 'Loop', 33: 'Near South Side', 34: 'Armour Square',
    35: 'Douglas', 36: 'Oakland', 37: 'Fuller Park', 38: 'Grand Boulevard',
    39: 'Kenwood', 40: 'Washington Park', 41: 'Hyde Park', 42: 'Woodlawn',
    43: 'South Shore', 44: 'Chatham', 45: 'Avalon Park', 46: 'South Chicago',
    47: 'Burnside', 48: 'Calumet Heights', 49: 'Roseland', 50: 'Pullman',
    51: 'South Deering', 52: 'East Side', 53: 'West Pullman', 54: 'Riverdale',
    55: 'Hegewisch', 56: 'Garfield Ridge', 57: 'Archer Heights',
    58: 'Brighton Park', 59: 'McKinley Park', 60: 'Bridgeport',
    61: 'New City', 62: 'West Elsdon', 63: 'Gage Park', 64: 'Clearing',
    65: 'West Lawn', 66: 'Chicago Lawn', 67: 'West Englewood',
    68: 'Englewood', 69: 'Greater Grand Crossing', 70: 'Ashburn',
    71: 'Auburn Gresham', 72: 'Beverly', 73: 'Washington Heights',
    74: 'Mount Greenwood', 75: 'Morgan Park', 76: 'O\'Hare', 77: 'Edgewater'
}

# 매핑 적용
df['Community Area Name'] = df['Community Area'].map(lambda x: community_area_map.get(int(x)) if pd.notna(x) else None)


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8189480 entries, 0 to 8190153
Data columns (total 25 columns):
 #   Column                Dtype  
---  ------                -----  
 0   index                 int64  
 1   ID                    int64  
 2   Case Number           object 
 3   Date                  object 
 4   Block                 object 
 5   IUCR                  object 
 6   Primary Type          object 
 7   Description           object 
 8   Location Description  object 
 9   Arrest                bool   
 10  Domestic              bool   
 11  Beat                  int64  
 12  District              float64
 13  Ward                  float64
 14  Community Area        float64
 15  FBI Code              object 
 16  X Coordinate          float64
 17  Y Coordinate          float64
 18  Year                  int64  
 19  Updated On            object 
 20  Latitude              float64
 21  Longitude             float64
 22  Location              object 
 23  geometry    

In [49]:
# Date 컬럼의 형식 object(문자열) 확인 -> 형식 변환
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M:%S", errors='coerce')
# 날짜(Date)와 시간(Time) 나누기
df['F_Date'] = df['Date'].dt.strftime('%Y-%m-%d')
df['Time'] = df['Date'].dt.strftime('%H:%M:%S')

# 결과 출력
print(df[['F_Date', 'Time']])

             F_Date      Time
0        2008-05-17  18:00:00
1        2008-05-27  01:00:00
2        2008-08-05  22:37:00
3        2008-12-27  20:00:00
4        2010-11-19  09:00:00
...             ...       ...
8190149  2025-03-16  12:36:00
8190150  2025-03-16  23:00:00
8190151  2025-03-16  20:44:00
8190152  2025-03-16  15:45:00
8190153  2025-03-16  09:44:00

[8189480 rows x 2 columns]


In [59]:
# 먼저 datetime으로 변환 (Date가 object인 경우)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# 중복된 Case Number만 추출
dupes = df[df.duplicated(subset=['Case Number'], keep=False)]

# 중복된 행들 중 연도만 추출
dupes['Year'] = dupes['Date'].dt.year

# 연도별 중복 수 세기
dupe_counts_by_year = dupes['Year'].value_counts().sort_index()

# 결과 출력
print(dupe_counts_by_year)

Year
2001     43
2002    196
2003    232
2004    228
2005    162
2006    257
2007    207
2008    254
2009    195
2010    174
2011    198
2012    245
2013    167
2014    148
2015    212
2016     64
2017     86
2018     48
2019     36
2020     54
2021     63
2022     92
2023     44
2024     55
2025      8
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupes['Year'] = dupes['Date'].dt.year


In [69]:
# 먼저 datetime으로 변환 (Date가 object인 경우)
df_unique['Date'] = pd.to_datetime(df_unique['Date'], errors='coerce')

# 중복된 Case Number만 추출
dupes = df_unique[df_unique.duplicated(subset=['Case Number'], keep=False)]

# 중복된 행들 중 연도만 추출
dupes['Year'] = dupes['Date'].dt.year

# 연도별 중복 수 세기
dupe_counts_by_year = dupes['Year'].value_counts().sort_index()

# 결과 출력
print(dupe_counts_by_year)

Series([], Name: count, dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['Date'] = pd.to_datetime(df_unique['Date'], errors='coerce')


In [66]:
# Case Number 기준으로 중복된 것들만 추출
dupes = df[df.duplicated(subset=['Case Number'], keep=False)]

# 중복된 것 중에서도 완전히 같은 행 제거 (즉, 값이 다른 애들만 남김)
dupes_unique = dupes.drop_duplicates()

# 같은 Case Number인데 다른 정보가 담긴 행들을 찾기
diff_cases = dupes_unique.groupby('Case Number').filter(lambda x: len(x) > 1)

# 상위 몇 개만 보기
diff_cases.sort_values('Case Number').head()


Unnamed: 0,index,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,geometry,Community Area Name,F_Date,Time
100961,100961,650,G023235,2001-01-11 18:46:00,101XX S LASALLE ST,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,...,1837772.0,2001,09/19/2022 03:41:05 PM,41.710181,-87.627189,"(41.710180828, -87.627189271)",POINT (-87.627189271 41.710180828),Roseland,2001-01-11,18:46:00
69208,69208,651,G023235,2001-01-11 21:10:00,101XX S LASALLE ST,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,...,1837772.0,2001,09/19/2022 03:41:05 PM,41.710181,-87.627189,"(41.710180828, -87.627189271)",POINT (-87.627189271 41.710180828),Roseland,2001-01-11,21:10:00
95696,95696,682,G083440,2001-02-09 23:40:00,046XX S LAWLER AVE,110,HOMICIDE,FIRST DEGREE MURDER,TRUCK,False,...,1873484.0,2001,09/19/2022 03:41:05 PM,41.808871,-87.749308,"(41.808871048, -87.749307968)",POINT (-87.749307968 41.808871048),Garfield Ridge,2001-02-09,23:40:00
131236,131236,681,G083440,2001-02-09 23:40:00,046XX S LAWLER AVE,110,HOMICIDE,FIRST DEGREE MURDER,TRUCK,False,...,1873484.0,2001,09/01/2022 03:42:17 PM,41.808871,-87.749308,"(41.808871048, -87.749307968)",POINT (-87.749307968 41.808871048),Garfield Ridge,2001-02-09,23:40:00
90499,90499,714,G137655,2001-03-10 06:30:00,015XX W LAWRENCE AVE,110,HOMICIDE,FIRST DEGREE MURDER,APARTMENT,True,...,1931956.0,2001,09/19/2022 03:41:05 PM,41.96889,-87.668043,"(41.968890414, -87.668042567)",POINT (-87.668042567 41.968890414),Uptown,2001-03-10,06:30:00


In [67]:
# 1. Case Number 기준으로 중복된 행 추출
dupes = df[df.duplicated(subset=['Case Number'], keep=False)]

# 2. 동일한 Case Number 그룹 중, 사건 유형(Primary Type)이 2개 이상인 경우 필터링
diff_type_cases = dupes.groupby('Case Number').filter(lambda x: x['Primary Type'].nunique() > 1)

# 3. 결과 확인 (상위 10개만 보기)
print(diff_type_cases.sort_values('Case Number').head(10))


Empty DataFrame
Columns: [index, ID, Case Number, Date, Block, IUCR, Primary Type, Description, Location Description, Arrest, Domestic, Beat, District, Ward, Community Area, FBI Code, X Coordinate, Y Coordinate, Year, Updated On, Latitude, Longitude, Location, geometry, Community Area Name, F_Date, Time]
Index: []

[0 rows x 27 columns]


In [68]:
df_unique = df.drop_duplicates(subset=['Case Number'], keep='first', ignore_index=True)

In [56]:
# 완전히 중복된 행만 추출
dupes = df_cleaned[df_cleaned.duplicated()]

# 연도 추출
dupes['Year'] = dupes['Date'].dt.year

# 연도별 중복 수 세기
dupe_counts_by_year = dupes['Year'].value_counts().sort_index()

# 결과 출력
print(dupe_counts_by_year)


Series([], Name: count, dtype: int64)


In [72]:
df_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8187697 entries, 0 to 8187696
Data columns (total 27 columns):
 #   Column                Dtype  
---  ------                -----  
 0   index                 int64  
 1   ID                    int64  
 2   Case Number           object 
 3   Date                  object 
 4   Block                 object 
 5   IUCR                  object 
 6   Primary Type          object 
 7   Description           object 
 8   Location Description  object 
 9   Arrest                bool   
 10  Domestic              bool   
 11  Beat                  int64  
 12  District              float64
 13  Ward                  float64
 14  Community Area        float64
 15  FBI Code              object 
 16  X Coordinate          float64
 17  Y Coordinate          float64
 18  Year                  int64  
 19  Updated On            object 
 20  Latitude              float64
 21  Longitude             float64
 22  Location              object 
 23  geometr

In [73]:
df = df_unique.drop(columns=['index'])

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8187697 entries, 0 to 8187696
Data columns (total 26 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Case Number           object 
 2   Date                  object 
 3   Block                 object 
 4   IUCR                  object 
 5   Primary Type          object 
 6   Description           object 
 7   Location Description  object 
 8   Arrest                bool   
 9   Domestic              bool   
 10  Beat                  int64  
 11  District              float64
 12  Ward                  float64
 13  Community Area        float64
 14  FBI Code              object 
 15  X Coordinate          float64
 16  Y Coordinate          float64
 17  Year                  int64  
 18  Updated On            object 
 19  Latitude              float64
 20  Longitude             float64
 21  Location              object 
 22  geometry              object 
 23  Communi

In [None]:
df.to_csv("crime_final1.csv", index=False)