# CALIDAD DE DATOS

## 1. IMPORTAMOS PAQUETES Y DATOS

In [37]:
import pandas as pd
import zipfile
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
%config IPCompleter.greedy = True
import warnings
warnings.filterwarnings('ignore')

Definimos la ruta donde ubicamos los datasets para no tener que estar poniéndola en cada ocasión

In [38]:
ruta_principal = 'C:/Users/Oscar/OneDrive - FM4/Escritorio/EVOLVE/Data Science/EVOLVE/Fernando_Costa/Practicas/Mini_Proyecto_EDA/'
carpeta = '999_data/'

Definimos el dataset que vamos a usar

In [39]:
zip_name = 'Crime_Data_from_2020_to_Present.zip'
csv_filename = 'Crime_Data_from_2020_to_Present.csv'

In [40]:
zip_path = ruta_principal + carpeta + zip_name

Cargamos el dataset df

In [41]:
frac = 0.33
chunksize = 100_000  # ajusta según memoria

rows = []

with zipfile.ZipFile(zip_path) as z:
    with z.open(csv_filename) as f:
        for chunk in pd.read_csv(f, chunksize=chunksize):
            rows.append(chunk.sample(frac=frac))
            
df = pd.concat(rows, ignore_index=True)
df.head()

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,Mocodes,Vict Age,Vict Sex,Vict Descent,Premis Cd,Premis Desc,Weapon Used Cd,Weapon Desc,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,201319771,11/08/2020 12:00:00 AM,11/04/2020 12:00:00 AM,1541,13,Newton,1363,1,331,THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND ...,1202 0344,76,M,B,218.0,BEAUTY/BARBER SHOP,,,IC,Invest Cont,331.0,,,,700 E 49TH ST,,33.9986,-118.2632
1,200611211,05/29/2020 12:00:00 AM,05/29/2020 12:00:00 AM,1855,6,Hollywood,666,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329 1822,40,F,W,101.0,STREET,,,IC,Invest Cont,740.0,,,,1100 VINE ST,,34.0908,-118.3266
2,201501048,10/30/2020 12:00:00 AM,06/30/2020 12:00:00 AM,1800,15,N Hollywood,1538,1,210,ROBBERY,1402 0344 0400 0387 0334 1414 1822,43,M,H,501.0,SINGLE FAMILY DWELLING,114.0,AIR PISTOL/REVOLVER/RIFLE/BB GUN,AA,Adult Arrest,210.0,998.0,,,ERWIN ST,FULCHER ST,34.1758,-118.3746
3,201416637,09/04/2020 12:00:00 AM,09/03/2020 12:00:00 AM,1200,14,Pacific,1494,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),,0,,,101.0,STREET,,,IC,Invest Cont,420.0,,,,7200 WORLD WAY,,33.9419,-118.4217
4,201712106,08/11/2020 12:00:00 AM,08/11/2020 12:00:00 AM,1000,17,Devonshire,1723,1,310,BURGLARY,1402 0913 0603 0344,42,M,H,501.0,SINGLE FAMILY DWELLING,,,IC,Invest Cont,310.0,998.0,,,20400 GERMAIN ST,,34.2622,-118.579


## 2. COMPROBAMOS LAS CARACTERÍSTICAS DEL DATASET

In [42]:
df.shape

(331647, 28)

Comprobamos que el dataset tiene nulos y nos sirve para trabajar

In [43]:
df.isna().sum().sort_values(ascending=False)

Crm Cd 4          331627
Crm Cd 3          330888
Crm Cd 2          308801
Cross Street      280649
Weapon Used Cd    223461
Weapon Desc       223461
Mocodes            49801
Vict Descent       47511
Vict Sex           47508
Premis Desc          223
Premis Cd              6
Crm Cd 1               2
Vict Age               0
Crm Cd Desc            0
Crm Cd                 0
Part 1-2               0
AREA NAME              0
Rpt Dist No            0
AREA                   0
TIME OCC               0
Date Rptd              0
DATE OCC               0
DR_NO                  0
Status                 0
Status Desc            0
LOCATION               0
LAT                    0
LON                    0
dtype: int64

Comprobamos las dimensiones de la tabla y los valores únicos de DR_NO ya que sospecho que debe ser el índice

In [44]:
dimensiones_df = df.shape
print(f'Dimensiones df: ', dimensiones_df)

Dimensiones df:  (331647, 28)


In [45]:
DR_NO = df.DR_NO.nunique()
print(f'Valores únicos de DR_NO: ', DR_NO)

Valores únicos de DR_NO:  331647


CAMBIOS EN NOMBRES E ÍNDICE: 
- Transformamos a minúsculas los nombres de las columnas
- Ponemos 'dr_no' como index ya que es el identificador del reporte 

In [46]:
df.columns = df.columns.str.replace(' ','_').str.lower()
df.columns = df.columns.str.replace('-','_')
df = df.set_index('dr_no')
df

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
201319771,11/08/2020 12:00:00 AM,11/04/2020 12:00:00 AM,1541,13,Newton,1363,1,331,THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND ...,1202 0344,76,M,B,218.0,BEAUTY/BARBER SHOP,,,IC,Invest Cont,331.0,,,,700 E 49TH ST,,33.9986,-118.2632
200611211,05/29/2020 12:00:00 AM,05/29/2020 12:00:00 AM,1855,6,Hollywood,666,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329 1822,40,F,W,101.0,STREET,,,IC,Invest Cont,740.0,,,,1100 VINE ST,,34.0908,-118.3266
201501048,10/30/2020 12:00:00 AM,06/30/2020 12:00:00 AM,1800,15,N Hollywood,1538,1,210,ROBBERY,1402 0344 0400 0387 0334 1414 1822,43,M,H,501.0,SINGLE FAMILY DWELLING,114.0,AIR PISTOL/REVOLVER/RIFLE/BB GUN,AA,Adult Arrest,210.0,998.0,,,ERWIN ST,FULCHER ST,34.1758,-118.3746
201416637,09/04/2020 12:00:00 AM,09/03/2020 12:00:00 AM,1200,14,Pacific,1494,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),,0,,,101.0,STREET,,,IC,Invest Cont,420.0,,,,7200 WORLD WAY,,33.9419,-118.4217
201712106,08/11/2020 12:00:00 AM,08/11/2020 12:00:00 AM,1000,17,Devonshire,1723,1,310,BURGLARY,1402 0913 0603 0344,42,M,H,501.0,SINGLE FAMILY DWELLING,,,IC,Invest Cont,310.0,998.0,,,20400 GERMAIN ST,,34.2622,-118.5790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240711752,10/02/2024 12:00:00 AM,10/01/2024 12:00:00 AM,1800,7,Wilshire,759,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),,0,,,101.0,STREET,,,IC,Invest Cont,420.0,,,,1100 CRENSHAW BL,,34.0531,-118.3239
240505317,02/03/2024 12:00:00 AM,02/03/2024 12:00:00 AM,2220,5,Harbor,564,2,850,INDECENT EXPOSURE,1822 0529 2004,29,F,B,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,,IC,Invest Cont,850.0,,,,400 W 3RD ST,,33.7415,-118.2857
240804529,01/15/2024 12:00:00 AM,01/11/2024 12:00:00 AM,1659,8,West LA,882,1,440,THEFT PLAIN - PETTY ($950 & UNDER),1822 0344,58,M,J,203.0,OTHER BUSINESS,,,IC,Invest Cont,440.0,,,,2100 SAWTELLE BL,,34.0396,-118.4425
241309619,05/22/2024 12:00:00 AM,05/20/2024 12:00:00 AM,2300,13,Newton,1361,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329 1307,45,F,H,122.0,"VEHICLE, PASSENGER/TRUCK",,,IC,Invest Cont,740.0,,,,100 W 47TH ST,,34.0010,-118.2740


## 3. SIGNIFICADO Y FORMATO DE LAS COLUMNAS

Ya sabemos que podemos trabajar con el dataset y lo tenemos cargado con los nombre de las columnas en su formato correto

Significado de las columnas:
- dr_no: Número de expediente oficial compuesto por un año de 2 dígitos, un ID de área y 5 dígitos.
- date_rptd: Fecha de Reporte. Indica el día en que el crimen o incidente fue oficialmente reportado (MM/DD/AAAA)
- date_occ: Fecha de Ocurrencia. Indica el día real en que el crimen tuvo lugar (MM/DD/AAAA)
- time_occ: En horario militar de 24 horas.
- area: Áreas Geográficas o Divisiones de Patrulla numeradas secuencialmente del 1 al 21.
- area_name: Nombre de las áreas Geográficas o Divisiones de Patrulla
- rpt_dist_no: Número de distrito de la patrulla del oficial que informó el incidente.
- part_1_2: Indica si el incidente es un crimen de la Parte 1 (crímenes más graves) o de la Parte 2 (crímenes menos graves).
- crm_cd: Código de delito de 3 dígitos del crimen cometido.
- crm_cd_desc: Descripción del delito del crimen cometido.
- mocodes: Modus Operandi o la manera distintiva o característica en que una persona lleva a cabo una actividad criminal.
- vict_age: Edad de la víctima
- vict_sex: Sexo de la víctima.
- vict_descent: Código de descendencia de la víctima
- premis_cd: El Código de Instalación es un código de 3 dígitos que identifica el tipo de lugar donde ocurrió el incidente
- premis_desc: Descripción de la Instalación.
- weapon_used_cd: El Código de Arma es un código de 3 dígitos que identifica el tipo de arma utilizada en el incidente
- weapon_desc: Descripción del Arma.
- status: Estado del caso. (IC es el valor predeterminado)
- status_desc: Define el Código de Estado proporcionado.
- crm_cd_1: Indica el crimen cometido. El Código de Crimen 1 es el principal y el más grave. Los Códigos de Crimen 2, 3 y 4 son, respectivamente, delitos menos graves. Los números de clase de crimen más bajos son más graves.
- crm_cd_2: Puede contener un código para un crimen adicional, menos grave que el Código de Crimen 1.
- crm_cd_3: Puede contener un código para un crimen adicional, menos grave que el Código de Crimen 1.
- crm_cd_4: Puede contener un código para un crimen adicional, menos grave que el Código de Crimen 1.
- location: Dirección postal del incidente del crimen redondeada al centenar de la cuadra más cercana para mantener el anonimato.
- cross_street: El nombre de la calle que se cruza con la calle principal donde ocurrió el incidente.
- lat: Latitud.
- lon: Longitud.

Códigos de descendencia: 
- A: Otros Asiáticos
- B: Afroamericano
- C: Chino
- D: Camboyano
- F: Filipino
- G: Guamés
- H: Hispano/Latino/Mexicano
- I: Indio Americano/Nativo de Alaska
- J: Japonés
- K: Coreano
- L: Laosiano
- O: Otro
- P: Isleño del Pacífico
- S: Samoano
- U: Hawaiano
- V: Vietnamita
- W: Blanco
- X: Desconocido
- Z: Hindú Asiático

Revisamos los tipos de datos de cada variable observando entre el .info() y la viasualización de la propia tabla y no se requieren cambios 

También observamos en el count que hay nulos en varias columnas que trataremos posteriormente

In [47]:
pd.set_option('display.max_columns', None)
df.sample()

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
211206686,02/11/2021 12:00:00 AM,02/10/2021 12:00:00 AM,2300,12,77th Street,1268,1,510,VEHICLE - STOLEN,,0,,,101.0,STREET,,,IC,Invest Cont,510.0,,,,200 E 80TH ST,,33.967,-118.2717


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 331647 entries, 201319771 to 242111298
Data columns (total 27 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   date_rptd       331647 non-null  object 
 1   date_occ        331647 non-null  object 
 2   time_occ        331647 non-null  int64  
 3   area            331647 non-null  int64  
 4   area_name       331647 non-null  object 
 5   rpt_dist_no     331647 non-null  int64  
 6   part_1_2        331647 non-null  int64  
 7   crm_cd          331647 non-null  int64  
 8   crm_cd_desc     331647 non-null  object 
 9   mocodes         281846 non-null  object 
 10  vict_age        331647 non-null  int64  
 11  vict_sex        284139 non-null  object 
 12  vict_descent    284136 non-null  object 
 13  premis_cd       331641 non-null  float64
 14  premis_desc     331424 non-null  object 
 15  weapon_used_cd  108186 non-null  float64
 16  weapon_desc     108186 non-null  object 
 17  stat

## 4. TRATAMIENTO DE VARIABLES

### FORMATO DE VARIABLES 

Las columnas 'date_rptd' y 'date_occ' vamos a eliminar la hora porque en todos los registros nos dicen que ha sido a las '12:00:00 AM'. Esta información es errónea según la columna 'time_occ' y además no nos aporta información como variable porque es una constante en todos los registros

In [49]:
df.sample()

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
200909062,04/28/2020 12:00:00 AM,04/17/2020 12:00:00 AM,1200,9,Van Nuys,909,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),0385 0344,33,M,O,103.0,ALLEY,,,IC,Invest Cont,420.0,,,,13600 VALERIO ST,,34.2049,-118.4291


Eliminamos la parte de la hora

In [50]:
df['date_rptd'] = df['date_rptd'].str.split(' ').str[0]
df['date_occ'] = df['date_occ'].str.split(' ').str[0]

Hacemos un datetime de las columnas de fechas

In [51]:
df.sample()

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
241807082,03/06/2024,03/04/2024,1900,18,Southeast,1822,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),,0,,,104.0,DRIVEWAY,,,IC,Invest Cont,420.0,,,,100 E COLDEN AV,,33.9492,-118.2739


In [52]:
df['date_rptd'] = pd.to_datetime(df['date_rptd'], format='%m/%d/%Y')
df['date_occ'] = pd.to_datetime(df['date_occ'], format='%m/%d/%Y')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 331647 entries, 201319771 to 242111298
Data columns (total 27 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date_rptd       331647 non-null  datetime64[ns]
 1   date_occ        331647 non-null  datetime64[ns]
 2   time_occ        331647 non-null  int64         
 3   area            331647 non-null  int64         
 4   area_name       331647 non-null  object        
 5   rpt_dist_no     331647 non-null  int64         
 6   part_1_2        331647 non-null  int64         
 7   crm_cd          331647 non-null  int64         
 8   crm_cd_desc     331647 non-null  object        
 9   mocodes         281846 non-null  object        
 10  vict_age        331647 non-null  int64         
 11  vict_sex        284139 non-null  object        
 12  vict_descent    284136 non-null  object        
 13  premis_cd       331641 non-null  float64       
 14  premis_desc     331424 non-nul

In [53]:
df.sample()

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
241406397,2024-02-22,2024-02-21,2200,14,Pacific,1454,1,510,VEHICLE - STOLEN,,0,,,101.0,STREET,,,IC,Invest Cont,510.0,,,,13100 MAXELLA AV,,33.9893,-118.4363


### DUPLICADOS

Comprobamos si hay duplicados y hacemos comparaciones para asegurarnos

In [54]:
duplicados = df.duplicated().sum()
print(f'Recuento de duplicados en df: ', duplicados)

Recuento de duplicados en df:  355


Nos salen que hay duplicados de las 27 columnas y que solamente es diferente el índice. Hacemos unas visualizaciones para comprobar que son realmente duplicados y vemos que es cierto, por lo que borramos todos los duplicados

In [55]:
lineas_original = len(df)
lineas_original

331647

In [56]:
df.loc[df.duplicated()].head(2)

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
201614391,2020-11-02,2020-10-12,1100,16,Foothill,1664,1,510,VEHICLE - STOLEN,,0,,,108.0,PARKING LOT,,,IC,Invest Cont,510.0,,,,9200 TUJUNGA AV,,34.2363,-118.3804
202017850,2020-12-01,2020-12-01,1427,20,Olympic,2076,2,624,BATTERY - SIMPLE ASSAULT,0913 0445 0447 1309 0319,51,F,H,719.0,MEDICAL/DENTAL OFFICES,500.0,UNKNOWN WEAPON/OTHER WEAPON,IC,Invest Cont,624.0,,,,1400 S VERMONT AV,,34.0436,-118.296


In [57]:
# Buscamos una en concreto por hacer la comprobación
df.loc[(df.mocodes == '0421 1528 1822 0903 2053')]

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
220706170,2022-02-26,2022-02-26,1200,7,Wilshire,752,2,930,CRIMINAL THREATS - NO WEAPON DISPLAYED,0421 1528 1822 0903 2053,0,M,O,501.0,SINGLE FAMILY DWELLING,511.0,VERBAL THREAT,IC,Invest Cont,930.0,946.0,,,1100 S ORLANDO AV,,34.0572,-118.3739
220706178,2022-02-26,2022-02-26,1200,7,Wilshire,752,2,930,CRIMINAL THREATS - NO WEAPON DISPLAYED,0421 1528 1822 0903 2053,0,M,O,501.0,SINGLE FAMILY DWELLING,511.0,VERBAL THREAT,IC,Invest Cont,930.0,946.0,,,1100 S ORLANDO AV,,34.0572,-118.3739


Eliminamos los duplicados

In [58]:
df.drop_duplicates(inplace=True)

In [59]:
lineas_despues = len(df)

diff = lineas_original - lineas_despues

print("Filas antes:", lineas_original)
print("Filas después:", lineas_despues)
print("Diferencia:", diff)

Filas antes: 331647
Filas después: 331292
Diferencia: 355


### VALORES ÚNICOS

Comprobamos los valores únicos en la tabla para desestimar variables. En este caso, no hay constantes

In [60]:
df.nunique().sort_values(ascending=True)

# Si hubiera alguna columna que eliminar haríamos df.drop(columns = '...', inplace=True)

part_1_2               2
crm_cd_4               3
vict_sex               4
status                 6
status_desc            6
vict_descent          19
area_name             21
area                  21
crm_cd_3              22
weapon_used_cd        79
weapon_desc           79
crm_cd_2             103
vict_age             103
crm_cd               137
crm_cd_desc          137
crm_cd_1             138
premis_desc          303
premis_cd            310
rpt_dist_no         1166
time_occ            1439
date_occ            1856
date_rptd           1871
lon                 4870
lat                 5169
cross_street        6593
location           50606
mocodes           117498
dtype: int64

### VALORES NULOS

Comprobamos los nulos que existen según el tipo de columna: Categórica y numérica

Primero haremos las comprobaciones, después expondré las conclusiones y a continuación realizaré los cambios

In [61]:
df.select_dtypes(include='number').isna().sum().sort_values(ascending=False)

crm_cd_4          331272
crm_cd_3          330533
crm_cd_2          308456
weapon_used_cd    223241
premis_cd              6
crm_cd_1               2
vict_age               0
crm_cd                 0
part_1_2               0
rpt_dist_no            0
area                   0
time_occ               0
lat                    0
lon                    0
dtype: int64

In [62]:
df.select_dtypes(exclude='number').isna().sum().sort_values(ascending=False)

cross_street    280359
weapon_desc     223241
mocodes          49664
vict_descent     47375
vict_sex         47372
premis_desc        223
area_name            0
date_occ             0
date_rptd            0
crm_cd_desc          0
status               0
status_desc          0
location             0
dtype: int64

In [63]:
df.vict_descent.unique()

array(['B', 'W', 'H', nan, 'X', 'O', 'A', 'P', 'K', 'C', 'F', 'V', 'J',
       'Z', 'I', 'S', 'U', 'D', 'G', 'L'], dtype=object)

In [64]:
df.loc[df.vict_descent=='-']

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1


In [65]:
df.vict_sex.unique()

array(['M', 'F', nan, 'X', 'H'], dtype=object)

In [66]:
df.loc[df.vict_sex=='-']

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1


In [67]:
df.loc[(df.premis_cd.notna())&(df.premis_desc.isna())].premis_cd.value_counts()

premis_cd
418.0    137
256.0     64
972.0      7
973.0      4
974.0      4
976.0      1
Name: count, dtype: int64

In [68]:
df.loc[(df.crm_cd_1.isna())]

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
230114704,2023-06-22,2023-06-22,1801,1,Central,162,1,442,SHOPLIFTING - PETTY THEFT ($950 & UNDER),0325,25,M,H,404.0,DEPARTMENT STORE,,,IC,Invest Cont,,442.0,,,700 S FLOWER ST,,34.0487,-118.2588
230106125,2023-02-01,2023-02-01,1855,1,Central,129,1,330,BURGLARY FROM VEHICLE,0344 1609,32,M,B,101.0,STREET,,,IC,Invest Cont,,330.0,,,GAREY,JACKSON,34.0513,-118.2344


In [69]:
df.loc[(df.status.isna())]

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1


In [70]:
df.loc[(df.crm_cd_1 == 510.0)]

Unnamed: 0_level_0,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
dr_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
200317116,2020-09-10,2020-09-10,1351,3,Southwest,329,1,510,VEHICLE - STOLEN,,0,,,108.0,PARKING LOT,,,IC,Invest Cont,510.0,,,,3300 S FIGUEROA ST,,34.0225,-118.2796
201916864,2020-11-13,2020-11-13,1830,19,Mission,1984,1,510,VEHICLE - STOLEN,,0,,,123.0,PARKING UNDERGROUND/BUILDING,,,IC,Invest Cont,510.0,,,,14900 ROSCOE BL,,34.2265,-118.4565
201318949,2020-10-24,2020-10-23,2200,13,Newton,1362,1,510,VEHICLE - STOLEN,,0,,,101.0,STREET,,,IC,Invest Cont,510.0,,,,48TH ST,AVALON BL,33.9998,-118.2652
201910635,2020-06-07,2020-06-07,550,19,Mission,1907,1,510,VEHICLE - STOLEN,,0,,,104.0,DRIVEWAY,,,AA,Adult Arrest,510.0,,,,12800 FENTON AV,,34.3062,-118.4222
202106989,2020-03-06,2020-03-05,1830,21,Topanga,2118,1,510,VEHICLE - STOLEN,,0,,,101.0,STREET,,,IC,Invest Cont,510.0,,,,8200 DELCO AV,,34.2182,-118.5787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240511143,2024-09-06,2024-09-03,100,5,Harbor,566,1,510,VEHICLE - STOLEN,,0,,,101.0,STREET,,,IC,Invest Cont,510.0,,,,300 W 9TH ST,,33.7360,-118.2836
241011254,2024-09-25,2024-09-10,1100,10,West Valley,1007,1,510,VEHICLE - STOLEN,,0,,,104.0,DRIVEWAY,,,IC,Invest Cont,510.0,,,,7800 AMESTOY AV,,34.2121,-118.5055
241813573,2024-10-02,2024-10-02,1,18,Southeast,1822,1,510,VEHICLE - STOLEN,,0,,,108.0,PARKING LOT,,,IC,Invest Cont,510.0,,,,100 W 104TH ST,,33.9469,-118.2772
241105695,2024-02-14,2024-02-13,2300,11,Northeast,1183,1,510,VEHICLE - STOLEN,,0,,,101.0,STREET,,,IC,Invest Cont,510.0,,,,1600 LUCRETIA AV,,34.0824,-118.2555


## 5. GUARDADO DE RESULTADOS

In [71]:
ruta_principal = 'C:/Users/Oscar/OneDrive - FM4/Escritorio/EVOLVE/Data Science/EVOLVE/Fernando_Costa/Practicas/Mini_Proyecto_EDA/'
carpeta = '002_archivos/'

In [72]:
ruta_trabajo = ruta_principal + carpeta + 'trabajo_resultado_calidad.pickle'

Guardar los archivos

In [73]:
df.to_pickle(ruta_trabajo)