## **Chicago Crime Analysis**

### **Data Injection**

In [1]:
# Importing necessary libraries
import pandas as pd

#libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

#libraries for time
from datetime import datetime, timedelta

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#set up views
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Loading dataset
data = r"Crimes_-_2001_to_Present.csv"
crime_data = pd.read_csv(data, low_memory=True)

### **Preliminarly Data Analysis (PDA)**

In [4]:
# Checking the overview of the data
print("\n===== The first five rows =====")
crime_data.head()




===== The first five rows =====


Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,10224738,HY411648,09/05/2015 01:30:00 PM,043XX S WOOD ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9.0,12.0,61.0,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117,-87.67,"(41.815117282, -87.669999562)"
1,10224739,HY411615,09/04/2015 11:30:00 AM,008XX N CENTRAL AVE,870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15.0,29.0,25.0,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.89508,-87.7654,"(41.895080471, -87.765400451)"
2,11646166,JC213529,09/01/2018 12:01:00 AM,082XX S INGLESIDE AVE,810,THEFT,OVER $500,RESIDENCE,False,True,631,6.0,8.0,44.0,06,,,2018,04/06/2019 04:04:43 PM,,,
3,10224740,HY411595,09/05/2015 12:45:00 PM,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14.0,35.0,21.0,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937406,-87.71665,"(41.937405765, -87.716649687)"
4,10224741,HY411610,09/05/2015 01:00:00 PM,0000X N LARAMIE AVE,560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15.0,28.0,25.0,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903,-87.755121,"(41.881903443, -87.755121152)"


In [5]:
print("\n===== The last five rows =====")
crime_data.tail()


===== The last five rows =====


Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
7784659,12847575,JF420478,09/01/2022 05:00:00 AM,005XX W SURF ST,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,1934,19.0,44.0,6.0,26,1172497.0,1919410.0,2022,01/03/2023 03:46:28 PM,41.934305,-87.641485,"(41.934304581, -87.641484982)"
7784660,12847801,JF420319,07/08/2022 12:00:00 AM,114XX S PRAIRIE AVE,1130,DECEPTIVE PRACTICE,FRAUD OR CONFIDENCE GAME,STREET,False,False,531,5.0,9.0,49.0,11,1179966.0,1828818.0,2022,01/03/2023 03:46:28 PM,41.685544,-87.616813,"(41.685543881, -87.616812541)"
7784661,12847324,JF420102,09/27/2022 11:00:00 AM,023XX E 70TH ST,0810,THEFT,OVER $500,RESIDENCE,False,False,331,3.0,5.0,43.0,6,1193181.0,1859005.0,2022,01/03/2023 03:46:28 PM,41.768068,-87.567453,"(41.768068052, -87.567452932)"
7784662,12847570,JF420427,09/03/2022 10:25:00 AM,052XX W CARMEN AVE,2021,NARCOTICS,POSSESS - BARBITURATES,RESIDENCE - YARD (FRONT / BACK),True,False,1623,16.0,45.0,11.0,18,1140553.0,1933418.0,2022,01/03/2023 03:46:28 PM,41.973391,-87.758535,"(41.973391184, -87.758534512)"
7784663,12840464,JF411839,09/26/2022 07:20:00 PM,0000X N MASON AVE,143A,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,SIDEWALK,True,False,1513,15.0,29.0,25.0,15,1136773.0,1899652.0,2022,01/03/2023 03:46:28 PM,41.880802,-87.773246,"(41.880802263, -87.773245737)"


In [3]:
crime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7784664 entries, 0 to 7784663
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Case Number           object 
 2   Date                  object 
 3   Block                 object 
 4   IUCR                  object 
 5   Primary Type          object 
 6   Description           object 
 7   Location Description  object 
 8   Arrest                bool   
 9   Domestic              bool   
 10  Beat                  int64  
 11  District              float64
 12  Ward                  float64
 13  Community Area        float64
 14  FBI Code              object 
 15  X Coordinate          float64
 16  Y Coordinate          float64
 17  Year                  int64  
 18  Updated On            object 
 19  Latitude              float64
 20  Longitude             float64
 21  Location              object 
dtypes: bool(2), float64(7), int64(3), object(1

From the above, it is evident that there are spaces in between the columns name, it will be of best practice to rename them.

In [4]:
# Renaming all the columns
crime_data.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)

In [5]:
crime_data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7784664 entries, 0 to 7784663
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   case_number           object 
 2   date                  object 
 3   block                 object 
 4   iucr                  object 
 5   primary_type          object 
 6   description           object 
 7   location_description  object 
 8   arrest                bool   
 9   domestic              bool   
 10  beat                  int64  
 11  district              float64
 12  ward                  float64
 13  community_area        float64
 14  fbi_code              object 
 15  x_coordinate          float64
 16  y_coordinate          float64
 17  year                  int64  
 18  updated_on            object 
 19  latitude              float64
 20  longitude             float64
 21  location              object 
dtypes: bool(2), float64(7), int64(3), object(1

From the above `crime_data.info()`, it can be seen that the memory usage is `5.2+ GB` which is not efficient enough. I'll like to reduce the memory usage. 

In [13]:
# Lets write a function to load the full dataset (7M+ records) or just a specified samples.
# The  idea behind this is just to reduce memory usage for the dataset to also load faster.
# Take your time to understand the function, what I did and why I did so.

def load_chicago_crime_data(data, sample_size=None):
    """
    Optimized loading function for Chicago crime data
    """
    # Define data types to reduce memory usage
    dtype_dict = {
        'id': 'Int32',
        'case_number': 'string',
        'iucr': 'category',
        'primary_type': 'category',
        'description': 'category',
        'location_description': 'category',
        'arrest': 'bool',
        'domestic': 'bool',
        'beat': 'Int16',
        'district': 'Int8',
        'ward': 'Int8',
        'community_area': 'Int8',
        'fbi_code': 'category',
        'x_coordinate': 'float32',
        'y_coordinate': 'float32',
        'latitude': 'float32',
        'longitude': 'float32'
    }
    
    # Parse dates during loading
    date_cols = ['Date', 'Updated On']
    
    if sample_size:
        # Load random sample for faster analysis
        df = pd.read_csv(data, 
                        dtype=dtype_dict,
                        parse_dates=date_cols,
                        low_memory=False, keep_default_na=True, nrows = sample_size) # you can use ".sample(n=sample_size)" as well.
    else:
        df = pd.read_csv(data, 
                        dtype=dtype_dict,
                        parse_dates=date_cols,
                        low_memory=False, keep_default_na=True)
    
    return df

# I would load this and comment it out...

In [14]:
df = load_chicago_crime_data(data, sample_size=100000)

In [15]:
df.tail()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
99995,10215495,HY401397,2015-08-28 22:10:00,031XX W LAWRENCE AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,PARKING LOT/GARAGE(NON.RESID.),False,True,1713,17,33.0,14.0,08B,1154394.0,1931719.0,2015,2018-02-10 15:50:01,41.968463,-87.707683,"(41.968462913, -87.707683157)"
99996,10215496,HY401545,2015-08-29 01:36:00,044XX N SAWYER AVE,460,BATTERY,SIMPLE,STREET,False,False,1724,17,33.0,14.0,08B,1153919.0,1929328.0,2015,2018-02-10 15:50:01,41.961911,-87.709494,"(41.961911365, -87.709493728)"
99997,10215497,HY401614,2015-08-29 03:52:00,037XX W LELAND AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,1723,17,39.0,14.0,14,1150607.0,1930973.0,2015,2018-02-10 15:50:01,41.966491,-87.721627,"(41.96649088, -87.721627447)"
99998,10215498,HY401663,2015-08-29 05:10:00,032XX N KIMBALL AVE,880,THEFT,PURSE-SNATCHING,RESTAURANT,False,False,1732,17,35.0,21.0,06,1153154.0,1921288.0,2015,2018-02-10 15:50:01,41.939864,-87.71252,"(41.939864288, -87.712520442)"
99999,10215499,HY401688,2015-08-29 01:00:00,011XX W GARFIELD BLVD,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,712,7,16.0,68.0,07,1169805.0,1868195.0,2015,2018-02-10 15:50:01,41.793826,-87.65287,"(41.793825753, -87.652869806)"


In [6]:
# Function to downcast the DataTypes to the smallest size.
# It will check through each of the columns and downcast the one will `int` and `float`

def downcast_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """
    Downcast numeric columns in a DataFrame to smaller dtypes 
    (integers -> smallest int, floats -> smallest float).
    Reports memory usage savings if verbose=True.
    
    Parameters used
    ----------
    df : pd.DataFrame
        Input DataFrame.
    verbose : bool, optional
        If True, prints memory usage before/after, by default True.
    
    Returns
    -------
    pd.DataFrame
        DataFrame with numeric columns downcasted.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2  # This will give us in the value of MB instead of the GB
    
    df_optimized = df.copy()

    for col in df_optimized.select_dtypes(include=["int", "float"]).columns:
        col_type = df_optimized[col].dtype

        if "int" in str(col_type):
            df_optimized[col] = pd.to_numeric(df_optimized[col], downcast="integer")
        elif "float" in str(col_type):
            df_optimized[col] = pd.to_numeric(df_optimized[col], downcast="float")

    end_mem = df_optimized.memory_usage(deep=True).sum() / 1024**2  # MB
    
    if verbose:
        print(f"Memory usage before: {start_mem:.2f} MB")
        print(f"Memory usage after : {end_mem:.2f} MB")
        print(f"Reduced by        : {100 * (start_mem - end_mem) / start_mem:.1f}%")

    return df_optimized


In [7]:
crime_data = downcast_columns(crime_data)

Memory usage before: 5302.92 MB
Memory usage after : 4976.27 MB
Reduced by        : 6.2%


In [8]:
crime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7784664 entries, 0 to 7784663
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int32  
 1   case_number           object 
 2   date                  object 
 3   block                 object 
 4   iucr                  object 
 5   primary_type          object 
 6   description           object 
 7   location_description  object 
 8   arrest                bool   
 9   domestic              bool   
 10  beat                  int16  
 11  district              float32
 12  ward                  float32
 13  community_area        float32
 14  fbi_code              object 
 15  x_coordinate          float32
 16  y_coordinate          float32
 17  year                  int16  
 18  updated_on            object 
 19  latitude              float32
 20  longitude             float32
 21  location              object 
dtypes: bool(2), float32(7), int16(2), int32(1)