In [1]:
# Add Matplotlib inline magic command
%matplotlib inline
%matplotlib notebook
# Dependencies and Setup
import os
import pandas as pd
from datetime import datetime as dt
import matplotlib.pyplot as plt
# Import the style from Matplotlib.
from matplotlib import style
# File to Load
lapd_arrests_data_to_load = os.path.join('..', 'resources', 'OLD', 'Los_Angeles_PD_Arrest_Data_2020_to_Present.csv')

In [2]:
# Reading Los_Angeles_PD_Arrest_Data_2020_to_Present.csv into a dataframe
lapd_arrests_data_df = pd.read_csv(lapd_arrests_data_to_load, encoding='cp1252', converters={'Time': str})
lapd_arrests_data_df.head(1)

Unnamed: 0,Report ID,Report Type,Arrest Date,Time,Area ID,Area Name,Reporting District,Age,Sex Code,Descent Code,...,Disposition Description,Address,Cross Street,LAT,LON,Location,Booking Date,Booking Time,Booking Location,Booking Location Code
0,211517634,RFC,11/16/2021 12:00:00 AM,830,15,N Hollywood,1535,57,M,W,...,MISDEMEANOR COMPLAINT FILED,LANKERSHIM,CALIFA,34.1775,-118.3822,POINT (-118.3822 34.1775),,,,


In [3]:
# Checking NaNs and column Dtype
lapd_arrests_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198736 entries, 0 to 198735
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Report ID                 198736 non-null  int64  
 1   Report Type               198736 non-null  object 
 2   Arrest Date               198736 non-null  object 
 3   Time                      198736 non-null  object 
 4   Area ID                   198736 non-null  int64  
 5   Area Name                 198736 non-null  object 
 6   Reporting District        198736 non-null  int64  
 7   Age                       198736 non-null  int64  
 8   Sex Code                  198736 non-null  object 
 9   Descent Code              198736 non-null  object 
 10  Charge Group Code         182953 non-null  float64
 11  Charge Group Description  182922 non-null  object 
 12  Arrest Type Code          198735 non-null  object 
 13  Charge                    198736 non-null  o

In [4]:
# Formated object 'Arrest Date' datetime into year/month/day
lapd_arrests_data_df['Arrest Date'] = pd.to_datetime(lapd_arrests_data_df['Arrest Date'], 
                                                     errors='coerce').dt.strftime('%Y/%m/%d')
#lapd_arrests_data_df['Arrest Date'] = lapd_arrests_data_df['Arrest Date'].dt.strftime('%Y/%m/%d')
lapd_arrests_data_df['Arrest Date']

0         2021/11/16
1         2021/09/01
2         2022/03/17
3         2021/09/20
4         2021/07/27
             ...    
198731    2022/10/25
198732    2022/11/30
198733    2022/10/03
198734    2022/09/26
198735    2022/10/05
Name: Arrest Date, Length: 198736, dtype: object

In [5]:
# Formated object 'Time' datetime into hour:minute
lapd_arrests_data_df['Time'] = pd.to_datetime(lapd_arrests_data_df['Time'],
                                                   errors='coerce').dt.strftime('%H:%M')
lapd_arrests_data_df['Time']

0           NaN
1           NaN
2           NaN
3         00:00
4           NaN
          ...  
198731      NaN
198732      NaN
198733      NaN
198734      NaN
198735      NaN
Name: Time, Length: 198736, dtype: object

In [6]:
# Added new series to DataFrame that combines objects ['Arrest Date'] and ['Time'] into datetime64 ['Report Datetime']
lapd_arrests_data_df['Report Datetime'] = lapd_arrests_data_df['Arrest Date'] + ' ' + lapd_arrests_data_df['Time']
lapd_arrests_data_df['Report Datetime'] = pd.to_datetime(lapd_arrests_data_df['Report Datetime'],
                                                         errors='coerce',
                                                         dayfirst=True,)
# Code bellow format series and coverts it into string(dont really want that for now)
#lapd_arrests_data_df['Datetime'] = lapd_arrests_data_df['Report Datetime'].dt.strftime('%d/%m/%Y %H:%M')
lapd_arrests_data_df['Report Datetime']

0               NaT
1               NaT
2               NaT
3        2021-09-20
4               NaT
            ...    
198731          NaT
198732          NaT
198733          NaT
198734          NaT
198735          NaT
Name: Report Datetime, Length: 198736, dtype: datetime64[ns]

In [7]:
# Added new series to DataFrame by using list comprehension and zipping ['LAT'] and ['LON'] into standard coordinate order ['Coordinates']
#x = [f'({a},{b})' for a, b in zip(df["a"], df["b"])]
lapd_arrests_data_df['Coordinates'] = [f'({a},{b})' for a, b in zip(lapd_arrests_data_df['LAT'], lapd_arrests_data_df['LON'])]
lapd_arrests_data_df['Coordinates']

0         (34.1775,-118.3822)
1         (33.9996,-118.2915)
2          (34.2037,-118.421)
3                   (0.0,0.0)
4         (34.2388,-118.4677)
                 ...         
198731    (33.9345,-118.2623)
198732    (34.0601,-118.2761)
198733    (34.0273,-118.3622)
198734    (34.2576,-118.5067)
198735     (34.0957,-118.331)
Name: Coordinates, Length: 198736, dtype: object

In [8]:
# Created sorted DataFrame that contains all reports from 2021, also only picked the columns needed for analysis
lapd_arrests_data_2021_df = lapd_arrests_data_df.loc[(lapd_arrests_data_df['Report Datetime']>='2021/01/01') & 
                                                     (lapd_arrests_data_df['Report Datetime']<='2021/12/31'), 
                                                     ['Report ID', 'Report Datetime', 'Age',
                                                      'Sex Code', 'Charge Group Description', 'Charge Description', 
                                                      'Address', 'Cross Street', 'LAT', 'LON', 'Coordinates']].sort_values(by='Report Datetime').copy()
lapd_arrests_data_2021_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22895 entries, 69575 to 174124
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Report ID                 22895 non-null  int64         
 1   Report Datetime           22895 non-null  datetime64[ns]
 2   Age                       22895 non-null  int64         
 3   Sex Code                  22895 non-null  object        
 4   Charge Group Description  21508 non-null  object        
 5   Charge Description        21510 non-null  object        
 6   Address                   22895 non-null  object        
 7   Cross Street              12730 non-null  object        
 8   LAT                       22895 non-null  float64       
 9   LON                       22895 non-null  float64       
 10  Coordinates               22895 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(6)
memory usage: 2.1+ MB

In [9]:
# Renaming columns to make more uniform
lapd_arrests_data_2021_df = lapd_arrests_data_2021_df.rename(columns={'Sex Code':'Sex',
                                                                     'Charge Group Description':'Charge Group',
                                                                     'Cross Street':'Street',
                                                                     'LAT':'Lat',
                                                                     'LON':'Lon'})

In [10]:
lapd_arrests_data_2021_df.dropna(subset=['Charge Group','Charge Description'], thresh=2, inplace=True)

In [11]:
# Reset index to 0 and drop old index column
lapd_arrests_data_2021_df.reset_index(drop=True, inplace=True)
lapd_arrests_data_2021_df.index.values


array([    0,     1,     2, ..., 21505, 21506, 21507], dtype=int64)

In [12]:
# Double checking NaNs and column Dtype
lapd_arrests_data_2021_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21508 entries, 0 to 21507
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Report ID           21508 non-null  int64         
 1   Report Datetime     21508 non-null  datetime64[ns]
 2   Age                 21508 non-null  int64         
 3   Sex                 21508 non-null  object        
 4   Charge Group        21508 non-null  object        
 5   Charge Description  21508 non-null  object        
 6   Address             21508 non-null  object        
 7   Street              12037 non-null  object        
 8   Lat                 21508 non-null  float64       
 9   Lon                 21508 non-null  float64       
 10  Coordinates         21508 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(6)
memory usage: 1.8+ MB


In [13]:
# Print final DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
lapd_arrests_data_2021_df.head(500)

Unnamed: 0,Report ID,Report Datetime,Age,Sex,Charge Group,Charge Description,Address,Street,Lat,Lon,Coordinates
0,6092639,2021-01-01,37,M,Other Assaults,BATTERY ON PERSON,MAIN,7TH,34.0491,-118.2558,"(34.0491,-118.2558)"
1,6092783,2021-01-01,50,F,Fraud/Embezzlement,EMBEZZLEMENT OF PROPERTY BY EMPLOYEE,19500 NORMANDIE AV,,33.8535,-118.2992,"(33.8535,-118.2992)"
2,6093021,2021-01-01,30,M,Miscellaneous Other Violations,LOS ANGELES MUNICIPAL CODE,WASHINGTON BL,RIMPAU BL,34.0398,-118.3435,"(34.0398,-118.3435)"
3,6092800,2021-01-01,54,M,Aggravated Assault,CORPORAL INJURY ON SPOUSE/COHABITANT/ETC,2900 9TH AV,,34.0277,-118.3272,"(34.0277,-118.3272)"
4,6092894,2021-01-01,55,F,Aggravated Assault,"ADW, NOT FIREARM, W/GBI",300 LOMA DR,,34.0595,-118.266,"(34.0595,-118.266)"
5,6092623,2021-01-01,18,F,Aggravated Assault,CORPORAL INJURY ON SPOUSE/COHABITANT/ETC,300 S NORMANDIE AV,,34.0672,-118.3047,"(34.0672,-118.3047)"
6,6092700,2021-01-01,52,M,Narcotic Drug Laws,POSSESS/PURCHASE CONTROLLED SUBS FOR SALE,GARTH,BEVERLYWOOD BL,34.0425,-118.3818,"(34.0425,-118.3818)"
7,6092714,2021-01-01,42,M,Miscellaneous Other Violations,VANDALISM W/LOSS VALUED EQ OR > $400,400 W 84TH ST,,33.9632,-118.2827,"(33.9632,-118.2827)"
8,6092599,2021-01-01,31,M,Narcotic Drug Laws,POSSESSION CONTROLLED SUBSTANCE,SAN PEDRO,49TH ST,33.9988,-118.2696,"(33.9988,-118.2696)"
9,212104044,2021-01-01,37,M,Miscellaneous Other Violations,TRESPASSING LANDS UNDER CULTIVATION,6300 VARIEL AV,,0.0,0.0,"(0.0,0.0)"


In [14]:
Los_Angeles_PD_Arrests_Data_2021_clean_csv = os.path.join('..', 'resources', 'Cleaned' 'Los_Angeles_PD_Arrest_Data_2021_clean.csv')
lapd_arrests_data_2021_df.to_csv(Los_Angeles_PD_Arrests_Data_2021_clean_csv, index=False, encoding='ISO=8859-1')