In [69]:
#Import the pandas library for data analysis
import pandas as pd

# Load the raw crime records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='crime_dataset.csv')

In [70]:
# Summarise the columns, non-null counts and dtypes of the loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124833 entries, 0 to 124832
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   id                   124833 non-null  int64 
 1   d_Date               124833 non-null  object
 2   b_Block              124833 non-null  object
 3   PrimaryType          124833 non-null  object
 4   des_Description      124833 non-null  object
 5   LocationDescription  124333 non-null  object
 6   Arrest               124833 non-null  bool  
 7   District             124833 non-null  int64 
 8   Ward                 124833 non-null  int64 
 9   Community Area       124833 non-null  int64 
 10  FBICode              124833 non-null  object
dtypes: bool(1), int64(4), object(6)
memory usage: 9.6+ MB


In [71]:
# Peek at the first rows of the raw date strings to work out their format before converting
df['d_Date'].head(n=5)

0    07/01/2024 12:00:00 AM
1    07/01/2024 12:00:00 AM
2    07/01/2024 12:00:00 AM
3    07/01/2024 12:00:00 AM
4    07/01/2024 12:00:00 AM
Name: d_Date, dtype: object

In [72]:
# Parse the 'd_Date' strings (e.g. '07/01/2024 12:00:00 AM') into datetime values.
# errors='coerce' does not raise on a value that fails to parse: it replaces it with NaT.
# Count the values that were lost that way and report them, so bad rows are not dropped
# from the analysis unnoticed.
parsed_dates = pd.to_datetime(df['d_Date'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
unparsed_count = int((parsed_dates.isna() & df['d_Date'].notna()).sum())
if unparsed_count:
    print(f"Warning: {unparsed_count} 'd_Date' value(s) did not match the expected format and were set to NaT")
df['d_Date'] = parsed_dates

In [73]:
# Check the first rows again to confirm the column now holds datetime values
print(df['d_Date'].head(n=5))

0   2024-07-01
1   2024-07-01
2   2024-07-01
3   2024-07-01
4   2024-07-01
Name: d_Date, dtype: datetime64[ns]


In [74]:
# Add a 'time' column holding the time-of-day part of 'd_Date', formatted as a 24-hour HH:MM:SS string
df['time'] = df['d_Date'].dt.strftime(date_format='%H:%M:%S')

In [75]:
# Display the freshly created 'time' column
df.loc[:, 'time']

0         00:00:00
1         00:00:00
2         00:00:00
3         00:00:00
4         00:00:00
            ...   
124828    00:00:00
124829    00:00:00
124830    00:00:00
124831    00:00:00
124832    00:00:00
Name: time, Length: 124833, dtype: object

In [76]:
# Display the dataframe to check the parsed 'd_Date' values alongside the new 'time' column
df

Unnamed: 0,id,d_Date,b_Block,PrimaryType,des_Description,LocationDescription,Arrest,District,Ward,Community Area,FBICode,time
0,13513629,2024-07-01,009XX W BELMONT AVE,BATTERY,"AGGRAVATED P.O. - HANDS, FISTS, FEET, NO / MIN...",SIDEWALK,True,19,44,6,08B,00:00:00
1,13515437,2024-07-01,008XX W BUCKINGHAM PL,THEFT,OVER $500,STREET,False,19,44,6,6,00:00:00
2,13519103,2024-07-01,091XX S NORMAL AVE,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,22,9,73,11,00:00:00
3,13513767,2024-07-01,021XX W 73RD ST,ROBBERY,ARMED - KNIFE / CUTTING INSTRUMENT,STREET,False,7,17,67,3,00:00:00
4,13514618,2024-07-01,053XX W WASHINGTON BLVD,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,VACANT LOT / LAND,False,15,37,25,15,00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
124828,13345615,2024-01-01,0000X E 112TH PL,CRIMINAL DAMAGE,TO VEHICLE,PARKING LOT / GARAGE (NON RESIDENTIAL),False,5,9,49,14,00:00:00
124829,13406628,2024-01-01,085XX S DAMEN AVE,OTHER OFFENSE,TELEPHONE THREAT,OTHER (SPECIFY),False,6,18,71,08A,00:00:00
124830,13446569,2024-01-01,064XX N RIDGE BLVD,THEFT,FROM BUILDING,NURSING / RETIREMENT HOME,False,24,50,2,6,00:00:00
124831,13325472,2024-01-01,021XX N LECLAIRE AVE,CRIMINAL DAMAGE,TO VEHICLE,RESIDENCE,False,25,26,19,14,00:00:00


In [77]:
# Inspect the dataframe's columns, non-null counts and dtypes after the date conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124833 entries, 0 to 124832
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   id                   124833 non-null  int64         
 1   d_Date               124833 non-null  datetime64[ns]
 2   b_Block              124833 non-null  object        
 3   PrimaryType          124833 non-null  object        
 4   des_Description      124833 non-null  object        
 5   LocationDescription  124333 non-null  object        
 6   Arrest               124833 non-null  bool          
 7   District             124833 non-null  int64         
 8   Ward                 124833 non-null  int64         
 9   Community Area       124833 non-null  int64         
 10  FBICode              124833 non-null  object        
 11  time                 124833 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(4), object(6)
memory usage: 10.6+ MB

In [78]:
# Keep an intermediate copy on disk (dates parsed, 'time' added) before missing values are removed
df.to_csv(path_or_buf='./cleaned_timedate.csv')

In [79]:
# Keep only the calendar-date part of 'd_Date' (the time of day already lives in the 'time' column)
# and store it back in the same 'd_Date' column.
# NOTE: the column is overwritten in place, so this cell expects 'd_Date' to still hold
# datetime values (from the conversion cell) when it runs.
df['d_Date'] = df['d_Date'].dt.date

In [80]:
# Count the missing values in each column
df.isna().sum()

id                       0
d_Date                   0
b_Block                  0
PrimaryType              0
des_Description          0
LocationDescription    500
Arrest                   0
District                 0
Ward                     0
Community Area           0
FBICode                  0
time                     0
dtype: int64

In [81]:
# Build a cleaned frame that drops every row containing a missing value in any column
df_cleaned = df.dropna(axis=0, how='any')

In [82]:
# Verify that no column of the cleaned frame still has missing values
df_cleaned.isna().sum()

id                     0
d_Date                 0
b_Block                0
PrimaryType            0
des_Description        0
LocationDescription    0
Arrest                 0
District               0
Ward                   0
Community Area         0
FBICode                0
time                   0
dtype: int64

In [84]:
# Save the cleaned dataframe (rows with missing values removed) to disk.
# The frame written here must be df_cleaned: dropna() returned a new frame, so df itself
# still contains the rows with a missing LocationDescription.
df_cleaned.to_csv('./clean_crimedata.csv')