# **Problem Statement**



### **Loading Libraries**

In [2]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from utils.data_loader import load_crime_dataset

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "ticks" theme and enable seaborn's single-letter color codes
sns.set_theme(color_codes=True, style="ticks")

### **Downloading/Locating dataset**

In [3]:
# Expected on-disk location of the Chicago crime CSV (relative to the notebook's working directory)
dataset_path = Path('../datasets/chicago-crime-data.csv')

# Only call the loader when no local copy is present.
# NOTE(review): presumably load_crime_dataset() fetches and stores the CSV at this
# path — confirm in utils.data_loader.
if not dataset_path.exists():
    load_crime_dataset()
else:
    print(f"File found: {dataset_path.name}")

File found: chicago-crime-data.csv


### **Loading Dataset**

In [4]:
# Read the CSV at dataset_path into a DataFrame
crime_df = pd.read_csv(filepath_or_buffer=dataset_path)

# Report (rows, columns), then preview the first four records
print(crime_df.shape)
crime_df.head(n=4)

(100000, 23)


Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,:@computed_region_awaf_s7ux
0,13641131,JH480792,2024-10-24T00:00:00.000,080XX S VINCENNES AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,True,...,44.0,14,1174857.0,1851682.0,2024,2024-10-31T15:41:20.000,41.748401,-87.634836,"{'latitude': '41.748400979', 'longitude': '-87...",17.0
1,13641183,JH480745,2024-10-24T00:00:00.000,050XX S HOMAN AVE,560,ASSAULT,SIMPLE,SCHOOL - PUBLIC GROUNDS,False,False,...,63.0,08A,1154558.0,1870934.0,2024,2024-10-31T15:41:20.000,41.80166,-87.708707,"{'latitude': '41.801659529', 'longitude': '-87...",49.0
2,13641492,JH481388,2024-10-24T00:00:00.000,019XX N ELSTON AVE,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,APARTMENT,False,False,...,22.0,11,1165676.0,1913009.0,2024,2024-10-31T15:41:20.000,41.916888,-87.666735,"{'latitude': '41.916888176', 'longitude': '-87...",16.0
3,13643453,JH483847,2024-10-24T00:00:00.000,010XX N WOOD ST,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,24.0,14,1164204.0,1907061.0,2024,2024-10-31T15:41:20.000,41.900598,-87.672311,"{'latitude': '41.900597722', 'longitude': '-87...",24.0


In [5]:
# List the column labels of the loaded frame
crime_df.columns

Index(['id', 'case_number', 'date', 'block', 'iucr', 'primary_type',
       'description', 'location_description', 'arrest', 'domestic', 'beat',
       'district', 'ward', 'community_area', 'fbi_code', 'x_coordinate',
       'y_coordinate', 'year', 'updated_on', 'latitude', 'longitude',
       'location', ':@computed_region_awaf_s7ux'],
      dtype='object')

In [36]:
# Dropping redundant columns
# ':@computed_region_awaf_s7ux' is the only column at position 22 onward in the
# loaded frame. Dropping it by name (rather than by positional slice) keeps the
# cell correct if the column order changes, and errors='ignore' makes the cell a
# no-op when it is re-run after the column is already gone.
crime_df = crime_df.drop(columns=[':@computed_region_awaf_s7ux'], errors='ignore')

In [37]:
# Check duplicate rows
# duplicated() flags each row that repeats an earlier one; the vectorized
# Series.sum() counts the flags without iterating row by row in Python.
print(f"Duplicated rows detected: {crime_df.duplicated().sum()}")

Duplicated rows detected: 0


In [38]:
# Count the missing (NaN) entries in every column
missing_per_column = crime_df.isna().sum()
missing_per_column

id                        0
case_number               0
date                      0
block                     0
iucr                      0
primary_type              0
description               0
location_description    310
arrest                    0
domestic                  0
beat                      0
district                  0
ward                      0
community_area            1
fbi_code                  0
x_coordinate             90
y_coordinate             90
year                      0
updated_on                0
latitude                 90
longitude                90
location                 90
dtype: int64

In [39]:
# Keep only the rows that have no missing value in any column
crime_df = crime_df.dropna(axis=0, how='any')

# Dimensions after the removal
crime_df.shape

(99602, 22)

## **Segregate Numerical and Categorical features**

In [40]:
# Print each column's dtype and non-null count
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99602 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    99602 non-null  int64  
 1   case_number           99602 non-null  object 
 2   date                  99602 non-null  object 
 3   block                 99602 non-null  object 
 4   iucr                  99602 non-null  object 
 5   primary_type          99602 non-null  object 
 6   description           99602 non-null  object 
 7   location_description  99602 non-null  object 
 8   arrest                99602 non-null  bool   
 9   domestic              99602 non-null  bool   
 10  beat                  99602 non-null  int64  
 11  district              99602 non-null  int64  
 12  ward                  99602 non-null  int64  
 13  community_area        99602 non-null  float64
 14  fbi_code              99602 non-null  object 
 15  x_coordinate          99

In [41]:
# Split the column labels by dtype:
#   int64 / float64 -> numerical
#   object / bool   -> categorical
# NOTE: 'date' and 'updated_on' are still object dtype at this point, so they
# end up in the categorical group.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'bool']

num_col = crime_df.select_dtypes(include=numeric_dtypes).columns
cat_col = crime_df.select_dtypes(include=categorical_dtypes).columns

# Show both groups, separated by a blank line
print(num_col, cat_col, sep='\n\n')

Index(['id', 'beat', 'district', 'ward', 'community_area', 'x_coordinate',
       'y_coordinate', 'year', 'latitude', 'longitude'],
      dtype='object')

Index(['case_number', 'date', 'block', 'iucr', 'primary_type', 'description',
       'location_description', 'arrest', 'domestic', 'fbi_code', 'updated_on',
       'location'],
      dtype='object')


In [42]:
# Preview the first five rows of the categorical columns only
crime_df.loc[:, cat_col].head(5)

Unnamed: 0,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,fbi_code,updated_on,location
0,JH480792,2024-10-24T00:00:00.000,080XX S VINCENNES AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,True,14,2024-10-31T15:41:20.000,"{'latitude': '41.748400979', 'longitude': '-87..."
1,JH480745,2024-10-24T00:00:00.000,050XX S HOMAN AVE,560,ASSAULT,SIMPLE,SCHOOL - PUBLIC GROUNDS,False,False,08A,2024-10-31T15:41:20.000,"{'latitude': '41.801659529', 'longitude': '-87..."
2,JH481388,2024-10-24T00:00:00.000,019XX N ELSTON AVE,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,APARTMENT,False,False,11,2024-10-31T15:41:20.000,"{'latitude': '41.916888176', 'longitude': '-87..."
3,JH483847,2024-10-24T00:00:00.000,010XX N WOOD ST,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,14,2024-10-31T15:41:20.000,"{'latitude': '41.900597722', 'longitude': '-87..."
4,JH480637,2024-10-24T00:00:00.000,025XX N NARRAGANSETT AVE,860,THEFT,RETAIL THEFT,SMALL RETAIL STORE,False,False,06,2024-10-31T15:41:20.000,"{'latitude': '41.926287021', 'longitude': '-87..."


- Frequency of crime over the months
- Frequency of each crime type (`primary_type`)
- Frequency of the locations where crimes occurred
- Districts with the most crime (use a geographic map to visualize)
- Beats with the most crime (1 district = x beats) (use a geographic map to visualize)