In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import pingouin as pg
import sys
import reverse_geocode

# Create a dictionary of versions
versions = {
    "Python": sys.version.split()[0],
    "Pandas": pd.__version__,
    "NumPy": np.__version__,
    "Matplotlib": plt.matplotlib.__version__,
    "Seaborn": sns.__version__,
    "Statsmodels": sm.__version__,
    "Pingouin": pg.__version__
}

# Display as a clean DataFrame
df_versions = pd.DataFrame(list(versions.items()), columns=['Library', 'Version'])
df_versions

Unnamed: 0,Library,Version
0,Python,3.13.9
1,Pandas,2.3.3
2,NumPy,2.3.4
3,Matplotlib,3.10.7
4,Seaborn,0.13.2
5,Statsmodels,0.14.5
6,Pingouin,0.5.5


## Read Data

In [2]:
# read csv
df = pd.read_csv("../Data/Crimes_2001_to_Present_20251226.csv")

In [3]:
# display all columns
pd.set_option('display.max_columns', None)

# display
df.head()

# reset options
# pd.reset_option('display.max_columns')

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,14059178,JJ526784,12/18/2025 12:00:00 AM,070XX S CHAPPEL AVE,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,331,3.0,5.0,43.0,26,1191097.0,1858637.0,2025,12/25/2025 03:41:39 PM,41.767109,-87.575104,"(41.7671089, -87.575103515)"
1,14059785,JJ527520,12/18/2025 12:00:00 AM,085XX S DANTE AVE,820,THEFT,$500 AND UNDER,STREET,False,False,412,4.0,8.0,45.0,6,1187355.0,1848745.0,2025,12/25/2025 03:41:39 PM,41.740054,-87.589133,"(41.74005402, -87.589132903)"
2,14062578,JJ530929,12/18/2025 12:00:00 AM,077XX S GREEN ST,820,THEFT,$500 AND UNDER,RESIDENCE - YARD (FRONT / BACK),False,False,621,6.0,17.0,71.0,6,1171977.0,1853575.0,2025,12/25/2025 03:41:39 PM,41.753659,-87.645334,"(41.753659288, -87.64533412)"
3,14063828,JJ532379,12/18/2025 12:00:00 AM,048XX W BELLE PLAINE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,1624,16.0,45.0,15.0,7,1143338.0,1926819.0,2025,12/25/2025 03:41:39 PM,41.955231,-87.748459,"(41.955231253, -87.748458925)"
4,14059235,JJ526636,12/18/2025 12:00:00 AM,029XX W POLK ST,281,CRIMINAL SEXUAL ASSAULT,NON-AGGRAVATED,APARTMENT,False,False,1135,11.0,28.0,27.0,2,1156879.0,1896229.0,2025,12/25/2025 03:41:39 PM,41.871025,-87.69951,"(41.871025414, -87.699510379)"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8466423 entries, 0 to 8466422
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Case Number           object 
 2   Date                  object 
 3   Block                 object 
 4   IUCR                  object 
 5   Primary Type          object 
 6   Description           object 
 7   Location Description  object 
 8   Arrest                bool   
 9   Domestic              bool   
 10  Beat                  int64  
 11  District              float64
 12  Ward                  float64
 13  Community Area        float64
 14  FBI Code              object 
 15  X Coordinate          float64
 16  Y Coordinate          float64
 17  Year                  int64  
 18  Updated On            object 
 19  Latitude              float64
 20  Longitude             float64
 21  Location              object 
dtypes: bool(2), float64(7), int64(3), object(1

In [5]:
df.shape

(8466423, 22)

In [None]:
def 

In [None]:
df[(df['Primary Type'].str.contains('HOMICIDE', case=False, na=False)) & (df.Year == 2025)]

In [6]:
# coordinates = [(41.894491615, -87.718722846)] # Example coordinates (lat, long)

coordinates = (41.722434522, -87.632168804)  # Single tuple
location_info = reverse_geocode.get(coordinates)
print(location_info)


{'country_code': 'US', 'city': 'Chatham', 'latitude': 41.74115, 'longitude': -87.61255, 'population': 31392, 'state': 'Illinois', 'county': 'Cook County', 'country': 'United States'}


In [64]:
location_info.get('city', np.nan)

'West Garfield Park'

In [None]:
def extract_address(row):
    try:
        location = geolocator.reverse((row['Latitude'], row['Longitude']))
        addr = location.raw['address']
        return {
            'neighborhood': addr.get('neighbourhood', ''),
            'city': addr.get('city', ''),
            'county': addr.get('county', ''),
            'state': addr.get('state', '')
        }
    except:
        return {'neighborhood': 'Unknown', 'city': '', 'county': '', 'state': ''}

df[['neigh', 'city', 'county', 'state']] = pd.json_normalize(
    df[['Latitude', 'Longitude']].head(50).apply(extract_address, axis=1)
)


In [9]:
# Initialize geocoder (free, no API key)
geolocator = Nominatim(user_agent="crime_analyzer")

# Single coordinate (Chicago example)
lat, lon = 41.955231, -87.748459
location = geolocator.reverse((lat, lon))
print(location.address)  
print(location.raw['address'])

4824, West Belle Plaine Avenue, Grayland, Portage Park, Chicago, Jefferson Township, Cook County, Illinois, 60641, United States
{'house_number': '4824', 'road': 'West Belle Plaine Avenue', 'neighbourhood': 'Grayland', 'suburb': 'Portage Park', 'city': 'Chicago', 'municipality': 'Jefferson Township', 'county': 'Cook County', 'state': 'Illinois', 'ISO3166-2-lvl4': 'US-IL', 'postcode': '60641', 'country': 'United States', 'country_code': 'us'}


In [24]:
# Initialize geocoder (free, no API key)
geolocator = Nominatim(user_agent="crime_analyzer")

# Single coordinate (Chicago example)
lat, lon = 41.871025,	-87.699510
location = geolocator.reverse((lat, lon))
print(location.address)  
print(location.raw['address'])

2922, West Polk Street, East Garfield Park, Chicago, West Chicago Township, Cook County, Illinois, 60624, United States
{'house_number': '2922', 'road': 'West Polk Street', 'suburb': 'East Garfield Park', 'city': 'Chicago', 'municipality': 'West Chicago Township', 'county': 'Cook County', 'state': 'Illinois', 'ISO3166-2-lvl4': 'US-IL', 'postcode': '60624', 'country': 'United States', 'country_code': 'us'}


In [25]:
location.raw['address']

{'house_number': '2922',
 'road': 'West Polk Street',
 'suburb': 'East Garfield Park',
 'city': 'Chicago',
 'municipality': 'West Chicago Township',
 'county': 'Cook County',
 'state': 'Illinois',
 'ISO3166-2-lvl4': 'US-IL',
 'postcode': '60624',
 'country': 'United States',
 'country_code': 'us'}

In [21]:
address = location.raw['address']
address

{'house_number': '4824',
 'road': 'West Belle Plaine Avenue',
 'neighbourhood': 'Grayland',
 'suburb': 'Portage Park',
 'city': 'Chicago',
 'municipality': 'Jefferson Township',
 'county': 'Cook County',
 'state': 'Illinois',
 'ISO3166-2-lvl4': 'US-IL',
 'postcode': '60641',
 'country': 'United States',
 'country_code': 'us'}

In [23]:
address.get('neighbourhood', 'N/A')

'Grayland'

In [19]:
# Crime type per year
df.groupby([df.Year, 'Primary Type']).size().unstack(fill_value=0)

Primary Type,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIM SEXUAL ASSAULT,CRIMINAL DAMAGE,CRIMINAL SEXUAL ASSAULT,CRIMINAL TRESPASS,DECEPTIVE PRACTICE,...,OTHER OFFENSE,PROSTITUTION,PUBLIC INDECENCY,PUBLIC PEACE VIOLATION,RITUALISM,ROBBERY,SEX OFFENSE,STALKING,THEFT,WEAPONS VIOLATION
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001,1011,31388,93471,26014,1,1770,55865,47,13242,14959,...,29669,6026,9,2750,8,18441,2238,204,99291,4278
2002,1032,31524,94152,25623,0,1800,55942,39,13881,13740,...,32602,6408,8,2457,2,18523,2173,200,98334,4282
2003,955,29477,88380,25157,0,1563,55011,54,14807,13464,...,31149,6214,6,2430,2,17332,2079,247,98876,4211
2004,778,28850,87135,24564,0,1524,53164,59,15913,13263,...,29533,7476,9,2495,1,15978,1805,215,95464,4298
2005,691,27067,83964,25503,0,1509,54548,53,16655,13592,...,28028,6124,4,2730,2,16047,1820,192,85684,4106
2006,726,25945,80666,24324,0,1427,57124,61,14505,13612,...,27101,7034,4,3068,7,15969,1576,186,86241,3821
2007,712,26313,79595,24858,0,1498,53749,67,13699,14180,...,26863,6087,5,3315,1,15450,1531,213,85156,3554
2008,644,25448,75930,26218,0,1500,52843,67,12310,14943,...,26532,5141,4,3013,0,16703,1504,190,88437,3877
2009,616,22861,68460,26767,0,1390,47725,61,10851,13895,...,25601,3940,10,3147,0,15981,1268,168,80977,4159
2010,522,21537,65403,26422,0,1319,40654,79,9401,12592,...,22012,2484,7,3538,0,14275,1145,189,76758,3704


In [15]:
df["Primary Type"].value_counts()

Primary Type
THEFT                                1798014
BATTERY                              1542513
CRIMINAL DAMAGE                       962723
NARCOTICS                             764952
ASSAULT                               568139
OTHER OFFENSE                         528217
BURGLARY                              447444
MOTOR VEHICLE THEFT                   434045
DECEPTIVE PRACTICE                    390729
ROBBERY                               315502
CRIMINAL TRESPASS                     227660
WEAPONS VIOLATION                     125666
PROSTITUTION                           70454
OFFENSE INVOLVING CHILDREN             60756
PUBLIC PEACE VIOLATION                 54957
SEX OFFENSE                            34452
CRIM SEXUAL ASSAULT                    27268
INTERFERENCE WITH PUBLIC OFFICER       20371
LIQUOR LAW VIOLATION                   15418
GAMBLING                               14666
ARSON                                  14480
HOMICIDE                               140

In [12]:
df.Year.unique()

array([2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015,
       2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004,
       2003, 2002, 2001])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8466423 entries, 0 to 8466422
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Case Number           object 
 2   Date                  object 
 3   Block                 object 
 4   IUCR                  object 
 5   Primary Type          object 
 6   Description           object 
 7   Location Description  object 
 8   Arrest                bool   
 9   Domestic              bool   
 10  Beat                  int64  
 11  District              float64
 12  Ward                  float64
 13  Community Area        float64
 14  FBI Code              object 
 15  X Coordinate          float64
 16  Y Coordinate          float64
 17  Year                  int64  
 18  Updated On            object 
 19  Latitude              float64
 20  Longitude             float64
 21  Location              object 
dtypes: bool(2), float64(7), int64(3), object(1

In [20]:
# Simulate crime data: 3 neighborhood types (urban, suburban, rural), n=100 each
np.random.seed(1776)
urban = np.random.normal(45, 5, 100)      # High crime
suburban = np.random.normal(20, 4, 100)   # Medium
rural = np.random.normal(5, 2, 100)       # Low

df = pd.DataFrame({
    'burglaries': np.concatenate([urban, suburban, rural]),
    'neighborhood': np.repeat(['urban', 'suburban', 'rural'], 100)
})

df.head()

Unnamed: 0,burglaries,neighborhood
0,45.059813,urban
1,39.20704,urban
2,48.228861,urban
3,38.516195,urban
4,44.27243,urban


### Descriptives and Visualization

In [8]:
# Summary stats by group
print(pg.descriptives('burglaries', data=df, groupby='neighborhood'))

AttributeError: module 'pingouin' has no attribute 'descriptives'

In [None]:
# Summary stats by group
print(pg.descriptives('burglaries', data=df, groupby='neighborhood'))

# Boxplot (reveals clear separation)
sns.boxplot(data=df, x='neighborhood', y='burglaries')
plt.title('Burglary Rates by Neighborhood Type')
plt.show()


In [2]:
# Check Assumptions
# Normality
print('Normality Test:')
print(pg.normality(df, dv='burglaries', group='neighborhood'))

Normality Test:
                     W      pval  normal
neighborhood                            
urban         0.991969  0.818999    True
suburban      0.987801  0.493632    True
rural         0.992402  0.849560    True


In [3]:
# Variance
print("\nVariance Test:")
print(pg.homoscedasticity(df, dv='burglaries', group = 'neighborhood'))


Variance Test:
               W          pval  equal_var
levene  17.16964  8.786251e-08      False


In [5]:
# Robust ANOVA (Welch's)
aov = pg.welch_anova(dv='burglaries', between='neighborhood', data = df)
print("\nWelch's ANOVA Rersults:")
print(aov)


Welch's ANOVA Rersults:
         Source  ddof1      ddof2            F          p-unc       np2
0  neighborhood      2  177.20733  4058.268877  1.018697e-148  0.954216
