# Chicago Crime Data Analysis

## Data ingestion: Dataset Setup and Loading

In [29]:
import pandas as pd
import matplotlib.pyplot as plt

#libraries for time
from datetime import datetime, timedelta

#ignore warnings
import warnings
warnings.filterwarnings('ignore')



In [30]:
# loading the zip data
# path="../data/archive_3.zip"
# df= pd.read_csv(path,low_memory=True, compression='zip')

# loading the csv file
path="../data/Crimes_2001_to_Present.csv"

# Define data types to reduce memory usage
dtype_dict = {
'ID': 'Int32',
'Case Number': 'string',
'IUCR': 'category',
'Primary Type': 'category',
'Description': 'category',
'Location Description': 'category',
'Arrest': 'bool',
'Domestic': 'bool',
'Beat': 'Int16',
'District': 'Int8',
'Ward': 'Int8',
'Community Area': 'Int8',
'FBI Code': 'category',
'X Coordinate': 'float32',
'Y Coordinate': 'float32',
'Latitude': 'float32',
'Longitude': 'float32'
}
df = pd.read_csv(path, dtype=dtype_dict)

## Preliminary Data Analysis

In [31]:
# let take a snap shot view on our data
print("============ OVERVIEW OF THE DATA =========\n")
print(f"Data info: {df.info()}\n")

print(f"Column list: {df.columns}\n")

print("\n============ Description of the data =========\n")
print(f"{df.describe(include='all')}")



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7784664 entries, 0 to 7784663
Data columns (total 22 columns):
 #   Column                Dtype   
---  ------                -----   
 0   ID                    Int32   
 1   Case Number           string  
 2   Date                  object  
 3   Block                 object  
 4   IUCR                  category
 5   Primary Type          category
 6   Description           category
 7   Location Description  category
 8   Arrest                bool    
 9   Domestic              bool    
 10  Beat                  Int16   
 11  District              Int8    
 12  Ward                  Int8    
 13  Community Area        Int8    
 14  FBI Code              category
 15  X Coordinate          float32 
 16  Y Coordinate          float32 
 17  Year                  int64   
 18  Updated On            object  
 19  Latitude              float32 
 20  Longitude             float32 
 21  Location              object  
dtypes: Int16(1), Int3

In [32]:
# checking first five rows
print ("\nThe last five rows")
print(df.head())



The last five rows
         ID Case Number                    Date                  Block  IUCR  \
0  10224738    HY411648  09/05/2015 01:30:00 PM        043XX S WOOD ST  0486   
1  10224739    HY411615  09/04/2015 11:30:00 AM    008XX N CENTRAL AVE  0870   
2  11646166    JC213529  09/01/2018 12:01:00 AM  082XX S INGLESIDE AVE  0810   
3  10224740    HY411595  09/05/2015 12:45:00 PM      035XX W BARRY AVE  2023   
4  10224741    HY411610  09/05/2015 01:00:00 PM    0000X N LARAMIE AVE  0560   

  Primary Type              Description Location Description  Arrest  \
0      BATTERY  DOMESTIC BATTERY SIMPLE            RESIDENCE   False   
1        THEFT           POCKET-PICKING              CTA BUS   False   
2        THEFT                OVER $500            RESIDENCE   False   
3    NARCOTICS    POSS: HEROIN(BRN/TAN)             SIDEWALK    True   
4      ASSAULT                   SIMPLE            APARTMENT   False   

   Domestic  ...  Ward  Community Area  FBI Code  X Coordinate Y C

In [33]:
# checking last five rows
print ("\nThe last five rows")
print(df.tail())


The last five rows
               ID Case Number                    Date                Block  \
7784659  12847575    JF420478  09/01/2022 05:00:00 AM      005XX W SURF ST   
7784660  12847801    JF420319  07/08/2022 12:00:00 AM  114XX S PRAIRIE AVE   
7784661  12847324    JF420102  09/27/2022 11:00:00 AM      023XX E 70TH ST   
7784662  12847570    JF420427  09/03/2022 10:25:00 AM   052XX W CARMEN AVE   
7784663  12840464    JF411839  09/26/2022 07:20:00 PM    0000X N MASON AVE   

         IUCR        Primary Type                    Description  \
7784659  2825       OTHER OFFENSE        HARASSMENT BY TELEPHONE   
7784660  1130  DECEPTIVE PRACTICE       FRAUD OR CONFIDENCE GAME   
7784661  0810               THEFT                      OVER $500   
7784662  2021           NARCOTICS         POSSESS - BARBITURATES   
7784663  143A   WEAPONS VIOLATION  UNLAWFUL POSSESSION - HANDGUN   

                    Location Description  Arrest  Domestic  ...  Ward  \
7784659                      

In [34]:
# checking the missing data
df.isna().sum()

ID                           0
Case Number                  4
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description     10381
Arrest                       0
Domestic                     0
Beat                         0
District                    47
Ward                    614848
Community Area          613476
FBI Code                     0
X Coordinate             86848
Y Coordinate             86848
Year                         0
Updated On                   0
Latitude                 86848
Longitude                86848
Location                 86848
dtype: int64

## Data Cleaning and Preprocessing

### Let extract data from data

In [35]:
 # Ensure Date is datetime
df['Date'] = pd.to_datetime(
    df['Date'],
    format="%m/%d/%Y %I:%M:%S %p",
    errors="coerce"
)


In [36]:
# Extract date components
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Hour'] = df['Date'].dt.hour
df['Day_of_Week'] = df['Date'].dt.dayofweek  # 0=Monday, 6=Sunday
df['Day_Name'] = df['Date'].dt.day_name()
df['Month_Name'] = df['Date'].dt.month_name()
df['Quarter'] = df['Date'].dt.quarter
df['Is_Weekend'] = df['Day_of_Week'].isin([5, 6])  # Saturday, Sunday

In [37]:
# let see the new columns
df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'Month', 'Day', 'Hour', 'Day_of_Week', 'Day_Name',
       'Month_Name', 'Quarter', 'Is_Weekend'],
      dtype='object')

### Now let remove missing values

In [38]:
# let remove missing value
df.isna().sum()

ID                           0
Case Number                  4
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description     10381
Arrest                       0
Domestic                     0
Beat                         0
District                    47
Ward                    614848
Community Area          613476
FBI Code                     0
X Coordinate             86848
Y Coordinate             86848
Year                         0
Updated On                   0
Latitude                 86848
Longitude                86848
Location                 86848
Month                        0
Day                          0
Hour                         0
Day_of_Week                  0
Day_Name                     0
Month_Name                   0
Quarter                      0
Is_Weekend                   0
dtype: int64

In [39]:
# let make a copy first
new_df= df.copy()

In [40]:
# removing invalid data 
print(f"Data size before cleaning: {len(new_df)}")
new_df.dropna(subset="Location", inplace=True)
print(f"Data size after cleaning: {len(new_df)}")

Data size before cleaning: 7784664
Data size after cleaning: 7697816


In [41]:
# removing duplicate data 
print(f"Data size before cleaning: {len(new_df)}")
new_df.drop_duplicates()
print(f"Data size after cleaning: {len(new_df)}")

Data size before cleaning: 7697816
Data size after cleaning: 7697816


In [42]:
# removing data outrange of Chicago distreet
print(f"Data size before cleaning: {len(new_df)}")
cleaned_df= new_df[((new_df["Latitude"]).between(41.6, 42.1)) &
                ((new_df["Longitude"]).between(-87.9, -87.5))
                ]
cleaned_df.reset_index()
print(f"After removing invalid coordinates: {len(cleaned_df)} removed:({len(new_df) - len(cleaned_df)})")


Data size before cleaning: 7697816
After removing invalid coordinates: 7671877 removed:(25939)


In [43]:
# Handle outliers in district/ward
ncleaned_df = cleaned_df[
    (cleaned_df['District'].between(1, 25)) & 
    (cleaned_df['Ward'].between(1, 50))
]
print(f"After removing District and ward outliers: {len(ncleaned_df)} removed:({len(cleaned_df) - len(ncleaned_df)})")


After removing District and ward outliers: 7068425 removed:(603452)


In [44]:
# let  Standardize text fields 'Primary Type' and 'Location Description' for easy retrieval without missing out data
ncleaned_df['Primary Type']=ncleaned_df['Primary Type'].str.upper()
ncleaned_df['Location Description']=ncleaned_df['Location Description'].str.upper()

In [45]:
ncleaned_df['Location Description'].head()

0    RESIDENCE
1      CTA BUS
3     SIDEWALK
4    APARTMENT
5    RESIDENCE
Name: Location Description, dtype: object

In [46]:
ncleaned_df['Primary Type'].head()

0      BATTERY
1        THEFT
3    NARCOTICS
4      ASSAULT
5     BURGLARY
Name: Primary Type, dtype: object

In [47]:
# verifying the data
# checking the missing data
 # Missing critical fields
critical_missing = ncleaned_df[['Date', 'Primary Type', 'Latitude', 'Longitude']].isnull().sum()
print(f"Missing Critical Fields:\n{critical_missing}")

Missing Critical Fields:
Date            0
Primary Type    0
Latitude        0
Longitude       0
dtype: int64


### Export the cleaned data

In [48]:
ncleaned_df.to_csv("../data/Cleaned_data.csv", index=False)

## Descriptive Data Analysis(DDA)

#### Some insight to look at
1.  Major type of crime (show to top 10)
2.  Crime per days of the week
3.  crime by months of the years
4.  

In [49]:
ncleaned_df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Longitude,Location,Month,Day,Hour,Day_of_Week,Day_Name,Month_Name,Quarter,Is_Weekend
0,10224738,HY411648,2015-09-05 13:30:00,043XX S WOOD ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,...,-87.669998,"(41.815117282, -87.669999562)",9,5,13,5,Saturday,September,3,True
1,10224739,HY411615,2015-09-04 11:30:00,008XX N CENTRAL AVE,870,THEFT,POCKET-PICKING,CTA BUS,False,False,...,-87.765404,"(41.895080471, -87.765400451)",9,4,11,4,Friday,September,3,False
3,10224740,HY411595,2015-09-05 12:45:00,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,...,-87.716652,"(41.937405765, -87.716649687)",9,5,12,5,Saturday,September,3,True
4,10224741,HY411610,2015-09-05 13:00:00,0000X N LARAMIE AVE,560,ASSAULT,SIMPLE,APARTMENT,False,True,...,-87.755119,"(41.881903443, -87.755121152)",9,5,13,5,Saturday,September,3,True
5,10224742,HY411435,2015-09-05 10:55:00,082XX S LOOMIS BLVD,610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,...,-87.658432,"(41.744378879, -87.658430635)",9,5,10,5,Saturday,September,3,True


## Visualization