In [21]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
df = pd.read_csv("chicago crime - 2001~current.csv", encoding='utf-8', low_memory=False)

In [25]:
# 중복된 Case Number만 추출
duplicate_cases = df[df.duplicated(subset='Case Number', keep=False)]

# 중복된 Case Number 개수 출력
num_duplicates = duplicate_cases['Case Number'].nunique()
print("중복된 Case Number 개수:", num_duplicates)

중복된 Case Number 개수: 493


In [26]:
# 먼저 datetime으로 변환 (Date가 object인 경우)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# 중복된 Case Number만 추출
dupes = df[df.duplicated(subset=['Case Number'], keep=False)]

# 중복된 행들 중 연도만 추출
dupes['Year'] = dupes['Date'].dt.year

# 연도별 중복 수 세기
dupe_counts_by_year = dupes['Year'].value_counts().sort_index()

# 결과 출력
print(dupe_counts_by_year)

Year
2001    43
2002    38
2003    56
2004    20
2005    14
2006    35
2007    33
2008    42
2009    37
2010    46
2011    28
2012    49
2013    25
2014    16
2015    50
2016    64
2017    86
2018    48
2019    36
2020    54
2021    63
2022    92
2023    44
2024    55
2025     8
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupes['Year'] = dupes['Date'].dt.year


In [29]:
# Case Number 기준으로 중복된 것들만 추출
dupes = df[df.duplicated(subset=['Case Number'], keep=False)]

# 중복된 것 중에서도 완전히 같은 행 제거 (즉, 값이 다른 애들만 남김)
dupes_unique = dupes.drop_duplicates()

# 같은 Case Number인데 다른 정보가 담긴 행들을 찾기
diff_cases = dupes_unique.groupby('Case Number').filter(lambda x: len(x) > 1)

# 상위 몇 개만 보기
diff_cases.sort_values('Case Number').head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
184382,651,G023235,2001-01-11 21:10:00,101XX S LASALLE ST,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,True,...,,,01A,1177058.0,1837772.0,2001,09/19/2022 03:41:05 PM,41.710181,-87.627189,"(41.710180828, -87.627189271)"
217497,650,G023235,2001-01-11 18:46:00,101XX S LASALLE ST,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,True,...,,,01A,1177058.0,1837772.0,2001,09/19/2022 03:41:05 PM,41.710181,-87.627189,"(41.710180828, -87.627189271)"
212007,682,G083440,2001-02-09 23:40:00,046XX S LAWLER AVE,110,HOMICIDE,FIRST DEGREE MURDER,TRUCK,False,False,...,23.0,56.0,01A,1143470.0,1873484.0,2001,09/19/2022 03:41:05 PM,41.808871,-87.749308,"(41.808871048, -87.749307968)"
249019,681,G083440,2001-02-09 23:40:00,046XX S LAWLER AVE,110,HOMICIDE,FIRST DEGREE MURDER,TRUCK,False,False,...,23.0,56.0,01A,1143470.0,1873484.0,2001,09/01/2022 03:42:17 PM,41.808871,-87.749308,"(41.808871048, -87.749307968)"
206588,714,G137655,2001-03-10 06:30:00,015XX W LAWRENCE AVE,110,HOMICIDE,FIRST DEGREE MURDER,APARTMENT,True,False,...,46.0,3.0,01A,1165173.0,1931956.0,2001,09/19/2022 03:41:05 PM,41.96889,-87.668043,"(41.968890414, -87.668042567)"


In [30]:
df_unique = df.drop_duplicates(subset=['Case Number'], keep='first', ignore_index=True)

In [31]:
# 1. Case Number 기준으로 중복된 행 추출
dupes = df_unique[df_unique.duplicated(subset=['Case Number'], keep=False)]

# 2. 동일한 Case Number 그룹 중, 사건 유형(Primary Type)이 2개 이상인 경우 필터링
diff_type_cases = dupes.groupby('Case Number').filter(lambda x: x['Primary Type'].nunique() > 1)

# 3. 결과 확인 (상위 10개만 보기)
print(diff_type_cases.sort_values('Case Number').head(10))

Empty DataFrame
Columns: [ID, Case Number, Date, Block, IUCR, Primary Type, Description, Location Description, Arrest, Domestic, Beat, District, Ward, Community Area, FBI Code, X Coordinate, Y Coordinate, Year, Updated On, Latitude, Longitude, Location]
Index: []

[0 rows x 22 columns]


In [32]:
df_unique.isna().sum()

ID                           0
Case Number                  0
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description     14099
Arrest                       0
Domestic                     0
Beat                         0
District                    47
Ward                    614813
Community Area          613440
FBI Code                     0
X Coordinate             91682
Y Coordinate             91682
Year                         0
Updated On                   0
Latitude                 91682
Longitude                91682
Location                 91682
dtype: int64

In [34]:
# Block 별 최빈값(mode) 계산
block_mode_values = df_unique.groupby('Block')[['District', 'Latitude', 'Longitude', 'X Coordinate', 'Y Coordinate', 'Location', 'Ward', 'Community Area']].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None)

# 결측치가 있는 행에 대해서 Block별 최빈값으로 채우기
chicago_filled = df_unique.copy()

for column in ['District', 'Latitude', 'Longitude', 'X Coordinate', 'Y Coordinate', 'Location', 'Ward', 'Community Area']:
    chicago_filled[column] = chicago_filled.apply(
        lambda row: block_mode_values.at[row['Block'], column] if pd.isnull(row[column]) and row['Block'] in block_mode_values.index else row[column],
        axis=1
    )

In [35]:
chicago_filled.isnull().sum()

ID                           0
Case Number                  0
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description     14099
Arrest                       0
Domestic                     0
Beat                         0
District                     0
Ward                    437933
Community Area          438178
FBI Code                     0
X Coordinate              2956
Y Coordinate              2956
Year                         0
Updated On                   0
Latitude                  2956
Longitude                 2956
Location                  2956
dtype: int64

In [36]:
df = chicago_filled.dropna(subset=['X Coordinate','Y Coordinate','Latitude', 'Longitude', 'Location'])

In [37]:
df.isna().sum()

ID                           0
Case Number                  0
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description     13853
Arrest                       0
Domestic                     0
Beat                         0
District                     0
Ward                    435844
Community Area          436179
FBI Code                     0
X Coordinate                 0
Y Coordinate                 0
Year                         0
Updated On                   0
Latitude                     0
Longitude                    0
Location                     0
dtype: int64

In [38]:
df['Date'] = pd.to_datetime(df['Date'], format="%m/%d/%Y %I:%M:%S %p", errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'], format="%m/%d/%Y %I:%M:%S %p", errors='coerce')


In [39]:
# 시기 기준으로 나누기
df_2001_2015 = df[df['Date'].dt.year <= 2015]
df_2016_2022 = df[(df['Date'].dt.year > 2015) & (df['Date'].dt.year < 2023)]
df_2023 = df[df['Date'].dt.year >= 2023]

# 파일로 저장 
df_2001_2015.to_csv("crime_2001_2015.csv", index=False)
df_2016_2022.to_csv("crime_2016_2022.csv", index=False)
df_2023.to_csv("crime_2023.csv", index=False)

In [44]:
def load_ward_gdf(csv_path, ward_col='WARD'):
    df = pd.read_csv(csv_path)
    df = df[df[ward_col].notna()]              # NA 제거
    df = df[df[ward_col] != 'OUT']             # 'OUT' 제거
    df[ward_col] = df[ward_col].astype(float)  # float형 변환
    df['geometry'] = df['the_geom'].apply(wkt.loads)
    return gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

In [46]:
df = pd.read_csv("crime_2001_2015.csv")
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])
crime_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
ward_gdf = load_ward_gdf("Boundaries - Wards (2003-2015).csv") 
joined = gpd.sjoin(crime_gdf, ward_gdf[['WARD', 'geometry']], how='left', predicate='within')
joined['Ward'] = joined['Ward'].fillna(joined['WARD'])  
joined.drop(columns=['WARD', 'index_right'], inplace=True)
joined.to_csv("crime_2001_2015_filled.csv", index=False)

In [47]:
df2 = pd.read_csv("crime_2016_2022.csv")
df2['geometry'] = gpd.points_from_xy(df2['Longitude'], df2['Latitude'])
gdf2 = gpd.GeoDataFrame(df2, geometry='geometry', crs='EPSG:4326')
ward2 = load_ward_gdf("Boundaries - Wards (2015-2023).csv")
joined2 = gpd.sjoin(gdf2, ward2[['WARD', 'geometry']], how='left', predicate='within')
joined2['Ward'] = joined2['Ward'].fillna(joined2['WARD'])
joined2.drop(columns=['WARD', 'index_right'], inplace=True)
joined2.to_csv("crime_2016_2022_filled.csv", index=False)

In [48]:
df3 = pd.read_csv("crime_2023.csv")
df3['geometry'] = gpd.points_from_xy(df3['Longitude'], df3['Latitude'])
gdf3 = gpd.GeoDataFrame(df3, geometry='geometry', crs='EPSG:4326')
ward3 = load_ward_gdf("Boundaries - Wards (2023-).csv", ward_col='ward_id')  
joined3 = gpd.sjoin(gdf3, ward3[['ward_id', 'geometry']], how='left', predicate='within')
joined3['Ward'] = joined3['Ward'].fillna(joined3['ward_id'])
joined3.drop(columns=['ward_id', 'index_right'], inplace=True)
joined3.to_csv("crime_2023_filled.csv", index=False)

In [49]:
# 1. 시기별 파일 불러오기
df1 = pd.read_csv("crime_2001_2015_filled.csv")
df2 = pd.read_csv("crime_2016_2022_filled.csv")
df3 = pd.read_csv("crime_2023_filled.csv")
# 2. 하나로 합치기
df_all = pd.concat([df1, df2, df3], ignore_index=True)
# 3. 저장
df_all.to_csv("crime_all_filled.csv", index=False)

In [50]:
# 최종 병합된 파일 로드
df = pd.read_csv("crime_all_filled.csv")
# 10개 샘플만 보기
print(df[['Date', 'Primary Type', 'Latitude', 'Longitude', 'Ward']].sample(10))

                        Date Primary Type   Latitude  Longitude  Ward
3863388  2009-05-06 16:50:00    NARCOTICS  41.746831 -87.655334  21.0
3315082  2008-03-11 22:11:46    NARCOTICS  41.757224 -87.562326   7.0
6096207  2016-06-14 22:00:00        THEFT  41.955514 -87.774412  38.0
5735053  2015-07-10 05:45:00        THEFT  41.945789 -87.774997  38.0
2077159  2005-05-30 11:40:00        THEFT  41.736824 -87.663090  18.0
870065   2002-09-25 11:00:00        THEFT  41.901553 -87.665803   1.0
2772468  2006-09-05 21:44:32    NARCOTICS  41.779427 -87.665986  15.0
4897876  2012-07-20 11:00:00     BURGLARY  41.959663 -87.729388  39.0
4635484  2011-10-01 12:25:00      BATTERY  41.785288 -87.602796  20.0
6685851  2018-08-15 01:30:00      ROBBERY  41.797310 -87.589047   4.0


In [51]:
df.isna().sum()

ID                           0
Case Number                  0
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description     13853
Arrest                       0
Domestic                     0
Beat                         0
District                     0
Ward                       411
Community Area          436198
FBI Code                     0
X Coordinate                 0
Y Coordinate                 0
Year                         0
Updated On                   0
Latitude                     0
Longitude                    0
Location                     0
geometry                     0
dtype: int64

In [52]:
df.shape

(8278294, 23)

In [53]:
print(df['Community Area'].unique())

[49. 53. 11. 63. 71. 58. 30. 61. 46. 60. 70. 23. 28.  2. 43. 40. 26. 41.
 31. 14. 65. 44. 42. 66. 29. 21.  3.  8.  1. 27. 51. 68. 19. 74. 75. 69.
 56. 38. 54. 13.  6. 48. 24. 52. 45. 25. nan 32.  5. 36. 67. 57. 10. 50.
 12. 35. 55. 73.  9. 22. 39. 20. 72. 47. 16.  7. 77. 15. 76. 34. 62.  4.
 18. 17. 59. 37. 64. 33.  0.]


In [54]:
(df['Community Area'] == 0).sum()

76

In [55]:
df['Community Area'] = df['Community Area'].replace(0, pd.NA)

In [56]:
area_df = pd.read_csv("Boundaries - Community Areas (current).csv")
area_df = area_df[area_df['AREA_NUMBE'].notna()]
area_df['AREA_NUMBE'] = area_df['AREA_NUMBE'].astype(float)
area_df['geometry'] = area_df['the_geom'].apply(wkt.loads)
area_gdf = gpd.GeoDataFrame(area_df, geometry='geometry', crs='EPSG:4326')
area_gdf = area_gdf[['AREA_NUMBE', 'geometry']].rename(columns={'AREA_NUMBE': 'Area_Num'})
df = df.reset_index()
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])
crime_gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
joined = gpd.sjoin(crime_gdf, area_gdf, how='left', predicate='within')
joined['Community Area'] = joined['Community Area'].fillna(joined['Area_Num']).astype(float)
joined.drop(columns=['Area_Num', 'index_right'], inplace=True)
joined.to_csv("crime_final2.csv", index=False)

  joined['Community Area'] = joined['Community Area'].fillna(joined['Area_Num']).astype(float)


In [57]:
df = pd.read_csv("crime_final2.csv")

# Community Area 결측치 확인
print("결측치 수:", df['Community Area'].isna().sum())

# 어떤 값들이 있는지 유니크하게 확인
print("유니크 값:", sorted(df['Community Area'].dropna().unique()))

결측치 수: 347
유니크 값: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0]


In [58]:
df.isna().sum()

index                       0
ID                          0
Case Number                 0
Date                        0
Block                       0
IUCR                        0
Primary Type                0
Description                 0
Location Description    13853
Arrest                      0
Domestic                    0
Beat                        0
District                    0
Ward                      411
Community Area            347
FBI Code                    0
X Coordinate                0
Y Coordinate                0
Year                        0
Updated On                  0
Latitude                    0
Longitude                   0
Location                    0
geometry                    0
dtype: int64

In [59]:
df = df.dropna(subset=['Ward', 'Community Area'])

In [60]:
df.isna().sum()

index                       0
ID                          0
Case Number                 0
Date                        0
Block                       0
IUCR                        0
Primary Type                0
Description                 0
Location Description    13853
Arrest                      0
Domestic                    0
Beat                        0
District                    0
Ward                        0
Community Area              0
FBI Code                    0
X Coordinate                0
Y Coordinate                0
Year                        0
Updated On                  0
Latitude                    0
Longitude                   0
Location                    0
geometry                    0
dtype: int64

In [63]:
# 매핑 딕셔너리
community_area_map = {
    1: 'Rogers Park', 2: 'West Ridge', 3: 'Uptown', 4: 'Lincoln Square',
    5: 'North Center', 6: 'Lake View', 7: 'Lincoln Park', 8: 'Near North Side',
    9: 'Edison Park', 10: 'Norwood Park', 11: 'Jefferson Park', 12: 'Forest Glen',
    13: 'North Park', 14: 'Albany Park', 15: 'Portage Park', 16: 'Irving Park',
    17: 'Dunning', 18: 'Montclare', 19: 'Belmont Cragin', 20: 'Hermosa',
    21: 'Avondale', 22: 'Logan Square', 23: 'Humboldt Park', 24: 'West Town',
    25: 'Austin', 26: 'West Garfield Park', 27: 'East Garfield Park',
    28: 'Near West Side', 29: 'North Lawndale', 30: 'South Lawndale',
    31: 'Lower West Side', 32: 'Loop', 33: 'Near South Side', 34: 'Armour Square',
    35: 'Douglas', 36: 'Oakland', 37: 'Fuller Park', 38: 'Grand Boulevard',
    39: 'Kenwood', 40: 'Washington Park', 41: 'Hyde Park', 42: 'Woodlawn',
    43: 'South Shore', 44: 'Chatham', 45: 'Avalon Park', 46: 'South Chicago',
    47: 'Burnside', 48: 'Calumet Heights', 49: 'Roseland', 50: 'Pullman',
    51: 'South Deering', 52: 'East Side', 53: 'West Pullman', 54: 'Riverdale',
    55: 'Hegewisch', 56: 'Garfield Ridge', 57: 'Archer Heights',
    58: 'Brighton Park', 59: 'McKinley Park', 60: 'Bridgeport',
    61: 'New City', 62: 'West Elsdon', 63: 'Gage Park', 64: 'Clearing',
    65: 'West Lawn', 66: 'Chicago Lawn', 67: 'West Englewood',
    68: 'Englewood', 69: 'Greater Grand Crossing', 70: 'Ashburn',
    71: 'Auburn Gresham', 72: 'Beverly', 73: 'Washington Heights',
    74: 'Mount Greenwood', 75: 'Morgan Park', 76: 'OHare', 77: 'Edgewater'
}

# 매핑 적용
df['Community Area Name'] = df['Community Area'].map(lambda x: community_area_map.get(int(x)) if pd.notna(x) else None)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8277869 entries, 0 to 8278293
Data columns (total 25 columns):
 #   Column                Dtype  
---  ------                -----  
 0   index                 int64  
 1   ID                    int64  
 2   Case Number           object 
 3   Date                  object 
 4   Block                 object 
 5   IUCR                  object 
 6   Primary Type          object 
 7   Description           object 
 8   Location Description  object 
 9   Arrest                bool   
 10  Domestic              bool   
 11  Beat                  int64  
 12  District              float64
 13  Ward                  float64
 14  Community Area        float64
 15  FBI Code              object 
 16  X Coordinate          float64
 17  Y Coordinate          float64
 18  Year                  int64  
 19  Updated On            object 
 20  Latitude              float64
 21  Longitude             float64
 22  Location              object 
 23  geometry    

In [64]:
# Date 컬럼의 형식 object(문자열) 확인 -> 형식 변환
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M:%S", errors='coerce')
# 날짜(Date)와 시간(Time) 나누기
df['F_Date'] = df['Date'].dt.strftime('%Y-%m-%d')
df['Time'] = df['Date'].dt.strftime('%H:%M:%S')

# 결과 출력
print(df[['F_Date', 'Time']])

             F_Date      Time
0        2008-05-17  18:00:00
1        2008-05-27  01:00:00
2        2008-08-05  22:37:00
3        2008-12-27  20:00:00
4        2015-09-24  00:00:00
...             ...       ...
8278289  2025-03-16  12:36:00
8278290  2025-03-16  23:00:00
8278291  2025-03-16  20:44:00
8278292  2025-03-16  15:45:00
8278293  2025-03-16  09:44:00

[8277869 rows x 2 columns]


In [66]:
df_1 = df.drop(columns=['index'])

In [72]:
# 중복된 Case Number만 추출
duplicate_cases = df_1[df_1.duplicated(subset='Case Number', keep=False)]

# 중복된 Case Number 개수 출력
num_duplicates = duplicate_cases['Case Number'].nunique()
print("중복된 Case Number 개수:", num_duplicates)

중복된 Case Number 개수: 1197


In [73]:
# 중복된 Case Number만 추출
dupes = df_1[df.duplicated(subset=['Case Number'], keep=False)]

# 중복된 행들 중 연도만 추출
dupes['Year'] = dupes['Date'].dt.year

# 연도별 중복 수 세기
dupe_counts_by_year = dupes['Year'].value_counts().sort_index()

# 결과 출력
print(dupe_counts_by_year)

Year
2002    158
2003    176
2004    210
2005    148
2006    224
2007    174
2008    214
2009    160
2010    128
2011    170
2012    196
2013    142
2014    132
2015    162
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupes['Year'] = dupes['Date'].dt.year


In [70]:
dupes = df_1[df_1.duplicated(subset=['Case Number'], keep=False)]

In [75]:
# 중복된 Case Number 중에서, 각각의 Case Number에 속한 고유한 Ward 수를 구함
ward_counts_per_case = (
    df_1[df_1.duplicated(subset='Case Number', keep=False)]
    .groupby('Case Number')['Ward']
    .nunique()
)

# 몇 개의 Case Number가 2개 이상의 Ward에 속해 있는지 확인
ward_multiples = ward_counts_per_case.value_counts().sort_index()

ward_multiples

Ward
1    1178
2      19
Name: count, dtype: int64

In [74]:

# 중복 Case 중 Ward 수 1개인 Case Number 추출
ward_counts_per_case = (
    df_1[df_1.duplicated(subset='Case Number', keep=False)]
    .groupby('Case Number')['Ward']
    .nunique()
)

single_ward_cases = ward_counts_per_case[ward_counts_per_case == 1].index
single_ward_rows = df_1[df_1['Case Number'].isin(single_ward_cases)]

# 해당 Ward 카운트
ward_distribution = single_ward_rows['Ward'].value_counts().sort_values(ascending=False)

print("중복된 Case 중 Ward가 1개인 케이스에서 Ward별 등장 빈도:")
print(ward_distribution)


중복된 Case 중 Ward가 1개인 케이스에서 Ward별 등장 빈도:
Ward
7.0     1228
10.0    1114
8.0       12
9.0        2
Name: count, dtype: int64


In [76]:
import pandas as pd
from collections import Counter

# 먼저 중복된 Case Number 중에서 2개 이상의 Ward에 속하는 경우 필터링
ward_counts_per_case = (
    df_1[df_1.duplicated(subset='Case Number', keep=False)]
    .groupby('Case Number')['Ward']
    .nunique()
)

# 2개 이상의 Ward에 속하는 Case Number만 선택
multi_ward_cases = ward_counts_per_case[ward_counts_per_case > 1].index

# 해당 Case Number에 속한 데이터만 가져오기
multi_ward_rows = df_1[df_1['Case Number'].isin(multi_ward_cases)]

# 각 Ward 등장 횟수 세기
ward_freq = Counter(multi_ward_rows['Ward'])

# Pandas 시리즈로 정렬해서 보기 좋게 출력
ward_freq_series = pd.Series(ward_freq).sort_values(ascending=False)

print("여러 개 Ward에 속한 중복 Case Number들 중 가장 자주 등장한 Ward 순위:")
print(ward_freq_series)


여러 개 Ward에 속한 중복 Case Number들 중 가장 자주 등장한 Ward 순위:
7.0     19
10.0    19
dtype: int64


In [77]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8277869 entries, 0 to 8278293
Data columns (total 26 columns):
 #   Column                Dtype         
---  ------                -----         
 0   ID                    int64         
 1   Case Number           object        
 2   Date                  datetime64[ns]
 3   Block                 object        
 4   IUCR                  object        
 5   Primary Type          object        
 6   Description           object        
 7   Location Description  object        
 8   Arrest                bool          
 9   Domestic              bool          
 10  Beat                  int64         
 11  District              float64       
 12  Ward                  float64       
 13  Community Area        float64       
 14  FBI Code              object        
 15  X Coordinate          float64       
 16  Y Coordinate          float64       
 17  Year                  int64         
 18  Updated On            object        
 19  Latit

In [78]:
df_2 = df_1.drop_duplicates(subset=['Case Number'], keep='first', ignore_index=True)

In [80]:
df_2.to_csv("chicago_crime_final.csv", index=False)