In [197]:
import pandas as pd
from ydata_profiling import ProfileReport

In [198]:
data = pd.read_csv("data_act_1.csv", sep=";")

data.head()

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDateTime,Disposition,Address,City,State,AgencyId,Range,AddressType
0,160903280,Assault / Battery,2016-03-30T00:00:00,18:42,2016-03-30T18:42:00,REP,100 Block Of Chilton Av,San Francisco,CA,1,,Premise Address
1,160912272,Homeless Complaint,2016-03-31T00:00:00,15:31,2016-03-31T15:31:00,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
2,160912590,Susp Info,2016-03-31T00:00:00,16:49,2016-03-31T16:49:00,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
3,160912801,Report,2016-03-31T00:00:00,17:38,2016-03-31T17:38:00,GOA,500 Block Of 7th St,San Francisco,CA,1,,Premise Address
4,160912811,594,2016-03-31T00:00:00,17:42,2016-03-31T17:42:00,REP,Beale St/bryant St,San Francisco,CA,1,,Intersection


In [199]:
#profile = ProfileReport(data, explorative=True)
#profile.to_file("data_profile.html")

In [200]:
duplicates = data[data.duplicated(subset='CrimeId', keep=False)]
duplicates.head()

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDateTime,Disposition,Address,City,State,AgencyId,Range,AddressType
26,160913455,Vandalism,2016-03-31T00:00:00,20:53,2016-03-31T20:53:00,ND,1600 Block Of Sunnydale Av,San Francisco,CA,1,,Premise Address
1707,160913455,Susp,2016-04-01T00:00:00,18:29,2016-04-01T18:29:00,GOA,Geary St/larkin St,San Francisco,CA,1,,Intersection
3792,160913455,Passing Call,2016-04-02T00:00:00,17:11,2016-04-02T17:11:00,Not recorded,900 Block Of Market St,San Francisco,CA,1,,Premise Address
7045,160950496,Passing Call,2016-04-04T00:00:00,6:51,2016-04-04T06:51:00,HAN,University St/felton St,San Francisco,CA,1,,Intersection
7046,160950496,Suspicious Vehicle,2016-04-04T00:00:00,6:51,2016-04-04T06:51:00,ND,1400 Block Of Cabrillo St,San Francisco,CA,1,,Premise Address


In [201]:
print(f"Number of duplicate rows based on 'CrimeId': {len(duplicates)}")

Number of duplicate rows based on 'CrimeId': 6


In [202]:
data = data.sort_values(by=['CrimeId', 'CallDateTime']).drop_duplicates(subset='CrimeId', keep='first')
data.head()

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDateTime,Disposition,Address,City,State,AgencyId,Range,AddressType
0,160903280,Assault / Battery,2016-03-30T00:00:00,18:42,2016-03-30T18:42:00,REP,100 Block Of Chilton Av,San Francisco,CA,1,,Premise Address
1,160912272,Homeless Complaint,2016-03-31T00:00:00,15:31,2016-03-31T15:31:00,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
2,160912590,Susp Info,2016-03-31T00:00:00,16:49,2016-03-31T16:49:00,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
3,160912801,Report,2016-03-31T00:00:00,17:38,2016-03-31T17:38:00,GOA,500 Block Of 7th St,San Francisco,CA,1,,Premise Address
4,160912811,594,2016-03-31T00:00:00,17:42,2016-03-31T17:42:00,REP,Beale St/bryant St,San Francisco,CA,1,,Intersection


In [203]:
filtered_data = data[data['AgencyId'] == "CA"]
filtered_data.head()

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDateTime,Disposition,Address,City,State,AgencyId,Range,AddressType
5771,160942112,Auto Boost / Strip,2016-04-03T00:00:00,14:30,2016-04-03T14:30:00,REP,Martin Luther King Dr/bowling Green Dr,,,CA,,1
8021,160952280,Auto Boost / Strip,2016-04-04T00:00:00,14:46,2016-04-04T14:46:00,REP,Martin Luther King Dr/nancy Pelosi Dr,S,,CA,,1
8473,160953118,Auto Boost / Strip,2016-04-04T00:00:00,18:11,2016-04-04T18:11:00,REP,Conservatory Drive E/john F Kennedy Dr,,,CA,,1


In [204]:
# Corregir filas donde AgencyId es "CA" y State es nulo
data.loc[(data['AgencyId'] == "CA") & (data['State'].isnull()), 'State'] = "CA"

# Si AddressType es un dígito, usarlo como AgencyId
data.loc[data['AddressType'].str.isdigit(), 'AgencyId'] = data['AddressType']

# Inferir AddressType por la dirección and AddressType es un dígito
data.loc[data['AddressType'].str.isdigit(), 'AddressType'] = data['AddressType'].apply(lambda x: "Intersection" if "/" in x else "Premise Address")
data.head()

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDateTime,Disposition,Address,City,State,AgencyId,Range,AddressType
0,160903280,Assault / Battery,2016-03-30T00:00:00,18:42,2016-03-30T18:42:00,REP,100 Block Of Chilton Av,San Francisco,CA,1,,Premise Address
1,160912272,Homeless Complaint,2016-03-31T00:00:00,15:31,2016-03-31T15:31:00,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
2,160912590,Susp Info,2016-03-31T00:00:00,16:49,2016-03-31T16:49:00,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
3,160912801,Report,2016-03-31T00:00:00,17:38,2016-03-31T17:38:00,GOA,500 Block Of 7th St,San Francisco,CA,1,,Premise Address
4,160912811,594,2016-03-31T00:00:00,17:42,2016-03-31T17:42:00,REP,Beale St/bryant St,San Francisco,CA,1,,Intersection


In [205]:
filtered_crime = data[data['CrimeId'] == 160942112]
filtered_crime

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDateTime,Disposition,Address,City,State,AgencyId,Range,AddressType
5771,160942112,Auto Boost / Strip,2016-04-03T00:00:00,14:30,2016-04-03T14:30:00,REP,Martin Luther King Dr/bowling Green Dr,,CA,1,,Premise Address


In [206]:
# Rellenar valores nulos en la columna 'City' con "San Francisco"
data['City'] = data['City'].fillna("San Francisco")

# Normalizar a Title Case
data['City'] = data['City'].str.title()

# Mapear variantes a valores consistentes
city_mapping = {
    "San Francisco": "San Francisco",
    " S": "San Francisco",
    "Treasure Isla": "Treasure Island"
}
data['City'] = data['City'].replace(city_mapping)

data['City'].unique()

array(['San Francisco', 'Daly City', 'Treasure Island', 'Yerba Buena',
       'Presidio', 'Brisbane'], dtype=object)

In [207]:
data.loc[data['AddressType'] == "Intersectioon", 'AddressType'] = "Intersection"
data['AddressType'].unique()

array(['Premise Address', 'Intersection', 'Common Location',
       'Geo-Override'], dtype=object)

In [208]:
# Normalizar OffenseDate a formato YYYY-MM-DD
data['OffenseDate'] = pd.to_datetime(data['OffenseDate']).dt.strftime('%Y-%m-%d')

# Normalizar CallTime a formato HH:MM:SS
data['CallTime'] = pd.to_datetime(data['CallTime'], format='%H:%M').dt.strftime('%H:%M:%S')

# Renombrar CallDateTime a CallDate y estandarizar formato YYYY-MM-DD
data = data.rename(columns={'CallDateTime': 'CallDate'})
data['CallDate'] = pd.to_datetime(data['CallDate']).dt.strftime('%Y-%m-%d')

data.head()

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDate,Disposition,Address,City,State,AgencyId,Range,AddressType
0,160903280,Assault / Battery,2016-03-30,18:42:00,2016-03-30,REP,100 Block Of Chilton Av,San Francisco,CA,1,,Premise Address
1,160912272,Homeless Complaint,2016-03-31,15:31:00,2016-03-31,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
2,160912590,Susp Info,2016-03-31,16:49:00,2016-03-31,GOA,2300 Block Of Market St,San Francisco,CA,1,,Premise Address
3,160912801,Report,2016-03-31,17:38:00,2016-03-31,GOA,500 Block Of 7th St,San Francisco,CA,1,,Premise Address
4,160912811,594,2016-03-31,17:42:00,2016-03-31,REP,Beale St/bryant St,San Francisco,CA,1,,Intersection


In [209]:
data = data.drop(columns=["Range", "State", "AgencyId"])
data.head()

Unnamed: 0,CrimeId,OriginalCrimeTypeName,OffenseDate,CallTime,CallDate,Disposition,Address,City,AddressType
0,160903280,Assault / Battery,2016-03-30,18:42:00,2016-03-30,REP,100 Block Of Chilton Av,San Francisco,Premise Address
1,160912272,Homeless Complaint,2016-03-31,15:31:00,2016-03-31,GOA,2300 Block Of Market St,San Francisco,Premise Address
2,160912590,Susp Info,2016-03-31,16:49:00,2016-03-31,GOA,2300 Block Of Market St,San Francisco,Premise Address
3,160912801,Report,2016-03-31,17:38:00,2016-03-31,GOA,500 Block Of 7th St,San Francisco,Premise Address
4,160912811,594,2016-03-31,17:42:00,2016-03-31,REP,Beale St/bryant St,San Francisco,Intersection


In [210]:
#profile_after = ProfileReport(data, explorative=True)
#profile_after.to_file("data_profile_after.html")

In [211]:
data.to_csv("ARMAS_CARRERA_ROBERTO _Actividad_1.csv", index=False)