# Initial Overview and Cleaning

## Imports

In [26]:
import pandas as pd
import glob 
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import matplotlib.ticker as mticks
from sklearn.compose import ColumnTransformer

## Functions

## Importing Data & Initial Inspection

In [27]:
# Using glob to get all filepaths that match the pattern (*=wildcard)
glob = sorted(glob.glob("Data/Chicago-Crime_2*.csv"))
glob

['Data\\Chicago-Crime_2001.csv',
 'Data\\Chicago-Crime_2002.csv',
 'Data\\Chicago-Crime_2003.csv',
 'Data\\Chicago-Crime_2004.csv',
 'Data\\Chicago-Crime_2005.csv',
 'Data\\Chicago-Crime_2006.csv',
 'Data\\Chicago-Crime_2007.csv',
 'Data\\Chicago-Crime_2008.csv',
 'Data\\Chicago-Crime_2009.csv',
 'Data\\Chicago-Crime_2010.csv',
 'Data\\Chicago-Crime_2011.csv',
 'Data\\Chicago-Crime_2012.csv',
 'Data\\Chicago-Crime_2013.csv',
 'Data\\Chicago-Crime_2014.csv',
 'Data\\Chicago-Crime_2015.csv',
 'Data\\Chicago-Crime_2016.csv',
 'Data\\Chicago-Crime_2017.csv',
 'Data\\Chicago-Crime_2018.csv',
 'Data\\Chicago-Crime_2019.csv',
 'Data\\Chicago-Crime_2020.csv',
 'Data\\Chicago-Crime_2021.csv',
 'Data\\Chicago-Crime_2022.csv']

In [10]:
# Using read_csv in a list comprehension and combine with concact to load all files
ch_dirt = pd.concat([pd.read_csv(f) for f in glob])

In [59]:
ch_dirt.head()

Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
0,1326041,01/01/2001 01:00:00 AM,BATTERY,SIMPLE,RESIDENCE,False,False,1624,16.0,,41.95785,-87.749185
1,1319931,01/01/2001 01:00:00 PM,BATTERY,SIMPLE,RESIDENCE,False,True,825,8.0,,41.783892,-87.684841
2,1324743,01/01/2001 01:00:00 PM,GAMBLING,ILLEGAL ILL LOTTERY,STREET,True,False,313,3.0,,41.780412,-87.61197
3,1310717,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2424,24.0,,42.012391,-87.678032
4,1318099,01/01/2001 01:00:00 AM,BATTERY,SIMPLE,RESIDENCE PORCH/HALLWAY,False,True,214,2.0,,41.819538,-87.62002


In [13]:
chi_dirt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7713109 entries, 0 to 238857
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Date                  object 
 2   Primary Type          object 
 3   Description           object 
 4   Location Description  object 
 5   Arrest                bool   
 6   Domestic              bool   
 7   Beat                  int64  
 8   District              float64
 9   Ward                  float64
 10  Latitude              float64
 11  Longitude             float64
dtypes: bool(2), float64(4), int64(2), object(4)
memory usage: 662.0+ MB


## Performing basic EDA on the dataframe

In [14]:
# inspecting data types for an unexpected data types
ch_dirt.dtypes

ID                        int64
Date                     object
Primary Type             object
Description              object
Location Description     object
Arrest                     bool
Domestic                   bool
Beat                      int64
District                float64
Ward                    float64
Latitude                float64
Longitude               float64
dtype: object

All data types make sense for the category

In [16]:
# Checking for any potential dupliacted rows
ch_dirt.duplicated().sum()

0

In [19]:
# Checking for the amount of null values in each collumn
ch_dirt.isna().sum()

ID                           0
Date                         0
Primary Type                 0
Description                  0
Location Description     10928
Arrest                       0
Domestic                     0
Beat                         0
District                    47
Ward                    614846
Latitude                 88685
Longitude                88685
dtype: int64

Many reports are missing location information. Could use district information rather than exact coordinates.

### Inspecting Categorical Columns for any inconsistencies

In [39]:
# Identifying string columns 
string_cols = ch_dirt.select_dtypes('object').columns

In [40]:
# Checking the number of unique values in each categorical column 
ch_dirt[string_cols].nunique()

Date                    3172250
Primary Type                 36
Description                 547
Location Description        215
dtype: int64

In [41]:
# Dropping the Date column as there are too many values
string_cols = string_cols.drop('Date')

In [42]:
# Obtaining value counts for all string columns
for col in string_cols:
    print(f'Value Counts for {col}')
    print(ch_dirt[col].value_counts())
    # Adding extra line for readability
    print('\n')

Value Counts for Primary Type
THEFT                                1626992
BATTERY                              1410889
CRIMINAL DAMAGE                       878914
NARCOTICS                             746155
ASSAULT                               501103
OTHER OFFENSE                         479174
BURGLARY                              422246
MOTOR VEHICLE THEFT                   366586
DECEPTIVE PRACTICE                    340958
ROBBERY                               289677
CRIMINAL TRESPASS                     212869
WEAPONS VIOLATION                     103906
PROSTITUTION                           69750
OFFENSE INVOLVING CHILDREN             55280
PUBLIC PEACE VIOLATION                 52086
SEX OFFENSE                            30442
CRIM SEXUAL ASSAULT                    27584
INTERFERENCE WITH PUBLIC OFFICER       18206
LIQUOR LAW VIOLATION                   14846
GAMBLING                               14616
ARSON                                  13121
HOMICIDE                 

Some things to note. 'Primary Type' has some inconsistencies on how 'non-criminal' is recorded, I will be leaving 'NON-CRIMINAL (SUBJECT SPECIFIED)' as is. Also will replace 'CRIMINAL SEXUAL ASSAULT' with 'CRIM SEXUAL ASSAULT' is the latter is more common. Too many entries for both 'Description' and 'Location Description' to properly look though.

In [71]:
# Setting all 'NON - CRIMINAL' values to ''NON-CRIMINAL'
ch_dirt['Primary Type'] = ch_dirt['Primary Type'].replace({'NON - CRIMINAL': 'NON-CRIMINAL', 'CRIMINAL SEXUAL ASSAULT': 'CRIM SEXUAL ASSAULT'})
# Ensuring the changes went through
ch_dirt['Primary Type'].value_counts()

THEFT                                1626992
BATTERY                              1410889
CRIMINAL DAMAGE                       878914
NARCOTICS                             746155
ASSAULT                               501103
OTHER OFFENSE                         479174
BURGLARY                              422246
MOTOR VEHICLE THEFT                   366586
DECEPTIVE PRACTICE                    340958
ROBBERY                               289677
CRIMINAL TRESPASS                     212869
WEAPONS VIOLATION                     103906
PROSTITUTION                           69750
OFFENSE INVOLVING CHILDREN             55280
PUBLIC PEACE VIOLATION                 52086
CRIM SEXUAL ASSAULT                    33937
SEX OFFENSE                            30442
INTERFERENCE WITH PUBLIC OFFICER       18206
LIQUOR LAW VIOLATION                   14846
GAMBLING                               14616
ARSON                                  13121
HOMICIDE                               12394
KIDNAPPING

In [62]:
# Removing the maximum number of displayed rows so I can view all valuecounts
ch_dirt.set_option('display.max_rows', None)

In [63]:
# Obtaining value counts for all string columns
for col in string_cols:
    print(f'Value Counts for {col}')
    print(ch_dirt[col].value_counts())
    # Adding extra line for readability
    print('\n')

Value Counts for Primary Type
THEFT                                1626992
BATTERY                              1410889
CRIMINAL DAMAGE                       878914
NARCOTICS                             746155
ASSAULT                               501103
OTHER OFFENSE                         479174
BURGLARY                              422246
MOTOR VEHICLE THEFT                   366586
DECEPTIVE PRACTICE                    340958
ROBBERY                               289677
CRIMINAL TRESPASS                     212869
WEAPONS VIOLATION                     103906
PROSTITUTION                           69750
OFFENSE INVOLVING CHILDREN             55280
PUBLIC PEACE VIOLATION                 52086
CRIM SEXUAL ASSAULT                    33937
SEX OFFENSE                            30442
INTERFERENCE WITH PUBLIC OFFICER       18206
LIQUOR LAW VIOLATION                   14846
GAMBLING                               14616
ARSON                                  13121
HOMICIDE                 

There are far too many errors for me to possibly catch or dream to fix, and due to the cardinality being so large, I will not be able to make use of this category eiter way.

In [81]:
# Fixing some inconsistencies in the 'Location Description' column
ch_dirt['Location Description'] = ch_dirt['Location Description'].replace(['TAVERN / LIQUOR STORE', 'TAVERN','LIQUOR STORE','BAR', 
                                                                           'BAR OR TAVERN'], 'TAVERN/LIQUOR STORE')
ch_dirt['Location Description'] = ch_dirt['Location Description'].replace(['HOTEL / MOTEL', 'HOTEL', 'MOTEL'], 'HOTEL/MOTEL')
ch_dirt['Location Description'] = ch_dirt['Location Description'].replace({'RESIDENCE - GARAGE' : 'RESIDENCE-GARAGE',
                                                                           'OTHER (SPECIFY)' : 'OTHER',
                                                                           'SCHOOL, PRIVATE, GROUNDS' : 'SCHOOL - PRIVATE GROUNDS',
                                                                           'MOVIE HOUSE / THEATER' : 'MOVIE HOUSE/THEATER',
                                                                           'BOAT / WATERCRAFT' : 'BOAT/WATERCRAFT',
                                                                           'TAXICAB' : 'TAXI CAB',
                                                                           'POOLROOM': 'POOL ROOM'})
                                                                          
# Ensuring the changes went through
ch_dirt['Location Description'].value_counts()

STREET                                                   2005166
RESIDENCE                                                1297277
APARTMENT                                                 865242
SIDEWALK                                                  725870
OTHER                                                     282114
PARKING LOT/GARAGE(NON.RESID.)                            202977
ALLEY                                                     171241
SCHOOL, PUBLIC, BUILDING                                  146378
SMALL RETAIL STORE                                        145207
RESIDENCE-GARAGE                                          143239
RESTAURANT                                                125737
RESIDENCE PORCH/HALLWAY                                   124202
VEHICLE NON-COMMERCIAL                                    123084
GROCERY FOOD STORE                                         97969
DEPARTMENT STORE                                           97483
GAS STATION              

In [85]:
# Fixing some inconsistencies in the 'Location Description' column
ch_dirt['Location Description'] = ch_dirt['Location Description'].replace(['AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA',
                                                                           'AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA',
                                                                           'AIRPORT EXTERIOR - NON-SECURE ARE',
                                                                           'AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA',
                                                                           'AIRPORT TERMINAL UPPER LEVEL - SECURE AREA',
                                                                          'AIRPORT TERMINAL LOWER LEVEL - SECURE AREA',
                                                                          'AIRPORT BUILDING NON-TERMINAL - SECURE AREA',
                                                                          'AIRPORT EXTERIOR - SECURE AREA',
                                                                          'AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA',
                                                                           'AIRPORT PARKING LOT',
                                                                           'AIRPORT EXTERIOR - NON-SECURE AREA',
                                                                           'AIRPORT VENDING ESTABLISHMENT '
                                                                          'AIRCRAFT',
                                                                          'AIRPORT'],
                                                                          'AIRPORT/AIRCRAFT')

ch_dirt['Location Description'] = ch_dirt['Location Description'].replace(['SCHOOL - PRIVATE GROUNDS', 
                                                                           'SCHOOL - PRIVATE BUILDING',    
                                                                           'SCHOOL, PUBLIC, BUILDING', 
                                                                           'SCHOOL - PUBLIC BUILDING',
                                                                          'SCHOOL - PUBLIC GROUNDS',
                                                                           'SCHOOL, PUBLIC, GROUNDS',
                                                                           'SCHOOL, PRIVATE, BUILDING',
                                                                          'SCHOOL YARD',
                                                                          'PUBLIC GRAMMAR SCHOOL',
                                                                           'PUBLIC HIGH SCHOOL'], 'SCHOOL')

ch_dirt['Location Description'] = ch_dirt['Location Description'].replace(['CHA HALLWAY / STAIRWELL / ELEVATOR', 
                                                                           'CHA STAIRWELL',    
                                                                           'STAIRWELL', 
                                                                           'CHA HALLWAY',
                                                                          'CHA ELEVATOR',
                                                                          'ELEVATOR',
                                                                          'HALLWAY / STAIRWELL / ELEVATOR',
                                                                           'CHA HALLWAY/STAIRWELL/ELEVATOR'], 'HALLWAY/STAIRWELL/ELEVATOR')

ch_dirt['Location Description'] = ch_dirt['Location Description'].replace(['RESIDENCE - PORCH / HALLWAY', 
                                                                           'RESIDENCE - YARD (FRONT / BACK)',    
                                                                           'RESIDENCE-GARAGE', 
                                                                           'RESIDENTIAL YARD (FRONT/BACK)',
                                                                          'DRIVEWAY - RESIDENTIAL',
                                                                          'HOUSE',
                                                                          'COACH HOUSE'
                                                                           ], 'RESIDENCE')

ch_dirt['Location Description'] = ch_dirt['Location Description'].replace(['CHURCH / SYNAGOGUE / PLACE OF WORSHIP', 
                                                                           'CHURCH PROPERTY',    
                                                                           'CHURCH' 
                                                                           ], 'CHURCH/SYNAGOGUE/PLACE OF WORSHIP')

ch_dirt['Location Description'] = ch_dirt['Location Description'].replace({'VACANT LOT / LAND' : 'VACANT LOT/LAND',
                                                                           'VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)' : 
                                                                           'VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)',
                                                                           'SCHOOL, PRIVATE, GROUNDS' : 'SCHOOL - PRIVATE GROUNDS',
                                                                           'VEHICLE - COMMERCIAL: TROLLEY BUS' : 
                                                                           'VEHICLE-COMMERCIAL - TROLLEY BUS',
                                                                          'VEHICLE - COMMERCIAL: ENTERTAINMENT / PARTY BUS' :
                                                                          'VEHICLE-COMMERCIAL - ENTERTAINMENT/PARTY BUS',
                                                                          'POLICE FACILITY/VEH PARKING LOT' : 'POLICE FACILITY',
                                                                          'POLICE FACILITY / VEHICLE PARKING LOT' : 'POLICE FACILITY',
                                                                           'VEHICLE - OTHER RIDE SERVICE' : 
                                                                          'VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)',
                                                                          'COLLEGE/UNIVERSITY GROUNDS' : 'COLLEGE/UNIVERSITY',
                                                                          'COLLEGE/UNIVERSITY RESIDENCE HALL' : 'COLLEGE/UNIVERSITY',
                                                                           'HOSPITAL BUILDING / GROUNDS' : 'HOSPITAL BUILDING/GROUNDS',
                                                                           'NURSING / RETIREMENT HOME' : 'NURSING HOME/RETIREMENT HOME',
                                                                           'NURSING HOME' : 'NURSING HOME/RETIREMENT HOME'
                                                                          })
                                                                          
# Ensuring the changes went through
ch_dirt['Location Description'].value_counts()

STREET                                                   2005166
RESIDENCE                                                1556808
APARTMENT                                                 865242
SIDEWALK                                                  725870
OTHER                                                     282114
PARKING LOT/GARAGE(NON.RESID.)                            202977
SCHOOL                                                    201955
ALLEY                                                     171241
SMALL RETAIL STORE                                        145207
RESTAURANT                                                125737
RESIDENCE PORCH/HALLWAY                                   124202
VEHICLE NON-COMMERCIAL                                    123084
GROCERY FOOD STORE                                         97969
DEPARTMENT STORE                                           97483
GAS STATION                                                85667
TAVERN/LIQUOR STORE      

COMMERCIAL / BUSINESS OFFICE 

TRUCK

DRIVEWAY - RESIDENTIAL 

ATM (AUTOMATIC TELLER MACHINE)

CHA APARTMENT         APARTMENT

CLUB

CTA TRAIN


MEDICAL / DENTAL OFFICE

COUNTY JAIL

JUNK YARD/GARBAGE DUMP


VACANT LOT

# TODO NEXT: 
## Remove 'CHA' and 'CTA from every entry'

In [55]:
# Saving a list of numerical columns
num_cols = ch_dirt.select_dtypes('number').columns
num_cols = num_cols.drop('ID')

In [58]:
# Using .describe to view the statistics of the data
for col in num_cols:# printing a description of each column
    print(f'Value counts for {col}')
    # .apply() being used to suppress scientific notation
    print(ch_dirt[col].describe().apply(lambda x : format(x, 'f')))
    # Print an empty line for readability
    print('\n')
    

Value counts for Beat
count    7713109.000000
mean        1186.293871
std          703.064759
min          111.000000
25%          621.000000
50%         1034.000000
75%         1731.000000
max         2535.000000
Name: Beat, dtype: object


Value counts for District
count    7713062.000000
mean          11.295328
std            6.951157
min            1.000000
25%            6.000000
50%           10.000000
75%           17.000000
max           31.000000
Name: District, dtype: object


Value counts for Ward
count    7098263.000000
mean          22.750796
std           13.847996
min            1.000000
25%           10.000000
50%           23.000000
75%           34.000000
max           50.000000
Name: Ward, dtype: object


Value counts for Latitude
count    7624424.000000
mean          41.842167
std            0.088811
min           36.619446
25%           41.768727
50%           41.855888
75%           41.906766
max           42.022910
Name: Latitude, dtype: object


Value counts for

Everything looks correct, no major outliers in either direction. The beat category is a more specific location so higher variance makes sense.