# Collision Model Notebook

## Ingesta de información y transformación de datos inicial

In [7]:
import requests as rq
import json
import pandas as pd
import numpy as np
import time

In [2]:

#---------------------------------------------------------
# TRAFFIC COLLISION DATA FROM 2010 TO PRESENT (Los Angeles)
#---------------------------------------------------------

# 1) Ingest the data through the API endpoint, one page at a time.
#    Each page is retried up to MAX_RETRIES times; on a persistent failure the
#    loop stops and whatever was downloaded so far is kept in la_collisions_data.

api_ep_la_collisions = 'https://data.lacity.org/resource/d5tf-ez2w.json'

LIMIT = 50000 # Maximum allowed per request (SODA 2.0)
MAX_RETRIES = 3
RETRY_DELAY = 10 # Seconds to wait between attempts
REQUEST_TIMEOUT = 60 # Seconds before a stalled request is abandoned (counts as a failed attempt)

# Storage for the data
la_collisions_data = []
offset = 0
total_retrieved = 0
EXPECTED_TOTAL_ROWS = 621677

print("Starting data retrieval...")

while True:
    retries = 0
    success = False
    last_error_message = ""

    while retries < MAX_RETRIES and not success:
        try:
            # An explicit $order keeps the row order stable between requests, so
            # consecutive offsets neither repeat nor skip rows.
            params = {'$limit': LIMIT, '$offset': offset, '$order': ':id'}
            response = rq.get(api_ep_la_collisions, params=params, timeout=REQUEST_TIMEOUT)

            if response.status_code == 200:
                chunk = response.json()
                success = True
            else:
                last_error_message = f"HTTP Error {response.status_code}: {response.text}"

        except (rq.exceptions.RequestException, ValueError) as e:
            # Network/timeout problems and undecodable JSON bodies are retried;
            # anything else is a programming error and is allowed to surface.
            last_error_message = f"Exception: {str(e)}"

        if not success:
            print(f"⚠️ {last_error_message}. Retrying...")
            retries += 1
            if retries < MAX_RETRIES:
                time.sleep(RETRY_DELAY)

    if not success:
        print(f"❌ Failed after {MAX_RETRIES} attempts. Stopping. Last error: {last_error_message}")
        break

    la_collisions_data.extend(chunk)
    retrieved = len(chunk)
    total_retrieved += retrieved
    print(f"✅ Partially ingested {retrieved} rows (Total: {total_retrieved})")
    offset += retrieved

    # A page shorter than LIMIT (including an empty one) means the end of the data.
    if retrieved < LIMIT:
        break

# `success` is False only when the loop stopped because a page could not be fetched.
if success:
    print(f"✅ Final ingestion completed: {len(la_collisions_data)} rows retrieved.")
else:
    print(f"❌ Ingestion incomplete: only {len(la_collisions_data)} rows retrieved before the failure.")

# Optional: verify the total number of rows
if len(la_collisions_data) == EXPECTED_TOTAL_ROWS:
    print(f"🎉 Successfully retrieved all {EXPECTED_TOTAL_ROWS} expected rows.")
else:
    print(f"❗ Warning: Expected {EXPECTED_TOTAL_ROWS} rows, but retrieved {len(la_collisions_data)} rows.")

Starting data retrieval...
✅ Partially ingested 50000 rows (Total: 50000)
✅ Partially ingested 50000 rows (Total: 100000)
✅ Partially ingested 50000 rows (Total: 150000)
✅ Partially ingested 50000 rows (Total: 200000)
✅ Partially ingested 50000 rows (Total: 250000)
✅ Partially ingested 50000 rows (Total: 300000)
✅ Partially ingested 50000 rows (Total: 350000)
✅ Partially ingested 50000 rows (Total: 400000)
✅ Partially ingested 50000 rows (Total: 450000)
✅ Partially ingested 50000 rows (Total: 500000)
✅ Partially ingested 50000 rows (Total: 550000)
✅ Partially ingested 50000 rows (Total: 600000)
✅ Partially ingested 21677 rows (Total: 621677)
✅ Final ingestion completed: 621677 rows retrieved.
🎉 Successfully retrieved all 621677 expected rows.


In [3]:
# Flatten the downloaded JSON records into a DataFrame with json_normalize
df_la_collisions = pd.json_normalize(data=la_collisions_data)

# Column overview: names, non-null counts and dtypes
df_la_collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621677 entries, 0 to 621676
Data columns (total 26 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   dr_no                        621677 non-null  object
 1   date_rptd                    621677 non-null  object
 2   date_occ                     621677 non-null  object
 3   time_occ                     621677 non-null  object
 4   area                         621677 non-null  object
 5   area_name                    621677 non-null  object
 6   rpt_dist_no                  621677 non-null  object
 7   crm_cd                       621677 non-null  object
 8   crm_cd_desc                  621677 non-null  object
 9   mocodes                      534353 non-null  object
 10  vict_age                     533483 non-null  object
 11  vict_sex                     610980 non-null  object
 12  vict_descent                 610029 non-null  object
 13  premis_cd     

In [4]:
# Shorter working name for the raw frame (same object, not a copy)
df = df_la_collisions

# Preview the first rows of the dataset
df.head(n=5)

Unnamed: 0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,crm_cd,crm_cd_desc,mocodes,...,cross_street,:@computed_region_qz3q_ghft,:@computed_region_k96s_3jcv,:@computed_region_tatf_ua23,:@computed_region_ur2y_g4cx,:@computed_region_kqwf_mjcx,:@computed_region_2dna_qi2s,location_1.latitude,location_1.longitude,location_1.human_address
0,212013850,2021-09-03T00:00:00.000,2021-09-02T00:00:00.000,2335,20,Olympic,2021,997,TRAFFIC COLLISION,3004 3027 3034 4027 3036 3101 3401 3701,...,6TH ST,22722,588,875,36.0,7,86,34.063,-118.3141,"{""address"": """", ""city"": """", ""state"": """", ""zip""..."
1,221417787,2022-10-17T00:00:00.000,2022-10-17T00:00:00.000,1620,14,Pacific,1406,997,TRAFFIC COLLISION,4027 3011 3028 3034 3037 3101 3401 3701,...,MOTOR AV,23451,881,1358,9.0,6,74,34.029,-118.4113,"{""address"": """", ""city"": """", ""state"": """", ""zip""..."
2,221418141,2022-10-26T00:00:00.000,2022-10-26T00:00:00.000,1135,14,Pacific,1434,997,TRAFFIC COLLISION,4027 3011 3025 3034 3037 3101 3401 3701,...,ROSEWOOD AV,24031,891,855,10.0,10,27,34.0052,-118.4478,"{""address"": """", ""city"": """", ""state"": """", ""zip""..."
3,222017859,2022-12-01T00:00:00.000,2022-12-01T00:00:00.000,230,20,Olympic,2044,997,TRAFFIC COLLISION,3003 0913 3026 3035 3037 3101 3401 3701 4020,...,SAN MARINO ST,22723,607,647,,12,89,34.0545,-118.3009,"{""address"": """", ""city"": """", ""state"": """", ""zip""..."
4,190319651,2019-08-24T00:00:00.000,2019-08-24T00:00:00.000,450,3,Southwest,356,997,TRAFFIC COLLISION,3036 3004 3026 3101 4003,...,NORMANDIE AV,22724,691,916,7.0,14,32,34.0255,-118.3002,"{""address"": """", ""city"": """", ""state"": """", ""zip""..."


In [5]:
# Drop columns that are irrelevant for temporal modelling
columnas_a_eliminar = [
    'dr_no', 'rpt_dist_no', 'crm_cd', 'crm_cd_desc',
    'cross_street', 'location_1.human_address'
]
# Also drop every column whose name starts with ':@computed'
columnas_a_eliminar += [col for col in df.columns if col.startswith(':@computed')]

# drop() returns a new frame, so the raw frame `df` is left untouched
df_limpio = df.drop(columns=columnas_a_eliminar)

# Parse the date columns; values that cannot be parsed become NaT
df_limpio['date_rptd'] = pd.to_datetime(df_limpio['date_rptd'], errors='coerce')
df_limpio['date_occ'] = pd.to_datetime(df_limpio['date_occ'], errors='coerce')

# Count missing values per column
nulos = df_limpio.isnull().sum()

# Show the summary of the cleaned frame, then the null counts.
# info() prints its report and returns None, so it is called as a statement
# instead of being packed into a tuple that would be displayed as (None, ...).
df_limpio.info()
nulos

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621677 entries, 0 to 621676
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   date_rptd             621677 non-null  datetime64[ns]
 1   date_occ              621677 non-null  datetime64[ns]
 2   time_occ              621677 non-null  object        
 3   area                  621677 non-null  object        
 4   area_name             621677 non-null  object        
 5   mocodes               534353 non-null  object        
 6   vict_age              533483 non-null  object        
 7   vict_sex              610980 non-null  object        
 8   vict_descent          610029 non-null  object        
 9   premis_cd             620718 non-null  object        
 10  premis_desc           620717 non-null  object        
 11  location              621677 non-null  object        
 12  location_1.latitude   621677 non-null  object        
 13 

(None,
 date_rptd                   0
 date_occ                    0
 time_occ                    0
 area                        0
 area_name                   0
 mocodes                 87324
 vict_age                88194
 vict_sex                10697
 vict_descent            11648
 premis_cd                 959
 premis_desc               960
 location                    0
 location_1.latitude         0
 location_1.longitude        0
 dtype: int64)

In [6]:
# vict_age: coerce to numeric (unparseable values become NaN) ...
df_limpio['vict_age'] = pd.to_numeric(df_limpio['vict_age'], errors='coerce')

# ... and fill the gaps with the median of the column
mediana_edad = df_limpio['vict_age'].median()
df_limpio['vict_age'] = df_limpio['vict_age'].fillna(mediana_edad)

# Categorical columns: missing values become the explicit label 'Unknown'
for col in ('vict_sex', 'vict_descent', 'premis_desc'):
    df_limpio[col] = df_limpio[col].fillna('Unknown')

# premis_cd: fill with the most frequent code (first mode)
moda_premis = df_limpio['premis_cd'].mode()[0]
df_limpio['premis_cd'] = df_limpio['premis_cd'].fillna(moda_premis)

# mocodes is not needed downstream, so it is removed
df_limpio = df_limpio.drop(columns=['mocodes'])

# Confirm that no nulls remain
nulos_post = df_limpio.isnull().sum()
print(nulos_post)

date_rptd               0
date_occ                0
time_occ                0
area                    0
area_name               0
vict_age                0
vict_sex                0
vict_descent            0
premis_cd               0
premis_desc             0
location                0
location_1.latitude     0
location_1.longitude    0
dtype: int64


In [8]:

# Numeric columns to inspect
columnas_numericas = ['time_occ', 'vict_age', 'location_1.latitude', 'location_1.longitude']

# Coerce each of them to a numeric dtype; unparseable values become NaN
for col in columnas_numericas:
    df_limpio[col] = pd.to_numeric(df_limpio[col], errors='coerce')

# IQR-based outlier report: one entry per column with the quartiles, the
# 1.5*IQR fences and the number of rows falling outside them
outliers_info = {}

for col in columnas_numericas:
    Q1, Q3 = df_limpio[col].dropna().quantile([0.25, 0.75])
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR

    # Rows strictly below the lower fence or strictly above the upper one
    fuera_de_rango = df_limpio[col].lt(limite_inferior) | df_limpio[col].gt(limite_superior)
    outliers = df_limpio[fuera_de_rango]

    outliers_info[col] = {
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'Limite inferior': limite_inferior,
        'Limite superior': limite_superior,
        'Cantidad outliers': len(outliers)
    }

# One row per inspected column
pd.DataFrame(outliers_info).T

Unnamed: 0,Q1,Q3,IQR,Limite inferior,Limite superior,Cantidad outliers
time_occ,930.0,1825.0,895.0,-412.5,3167.5,0.0
vict_age,30.0,49.0,19.0,1.5,77.5,16954.0
location_1.latitude,34.0172,34.1758,0.1586,33.7793,34.4137,12072.0
location_1.longitude,-118.4396,-118.2805,0.1591,-118.67825,-118.04185,1001.0


In [9]:
# Work on a copy so df_limpio keeps every row
df_filtrado = df_limpio.copy()

# Keep only the rows inside the 1.5*IQR fences of each column. The columns are
# processed one after another, so the quartiles of each column are computed on
# the frame already filtered by the previous columns.
for col in ['vict_age', 'location_1.latitude', 'location_1.longitude']:
    Q1, Q3 = df_filtrado[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    dentro_de_rango = df_filtrado[col].ge(lower_bound) & df_filtrado[col].le(upper_bound)
    df_filtrado = df_filtrado[dentro_de_rango]

# (rows, columns) remaining after the filter
df_filtrado.shape

(593193, 13)

## Preparación del dataset para entrenamiento de modelo de Random Forest

In [10]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
# Use the outlier-filtered frame for the modelling steps (alias of the same object, not a copy)
df = df_filtrado

In [12]:
# Make sure date_occ is datetime before the .dt accessors are used.
# The conversion is done on a copy: `df` would otherwise still be the
# boolean-filtered frame df_filtrado, and assigning a column to it modifies that
# frame in place and can trigger pandas' SettingWithCopyWarning.
df = df.copy()
df['date_occ'] = pd.to_datetime(df['date_occ'])

In [13]:
# Keep only the Hollywood area and count the collisions per occurrence date
df = df[df['area_name'] == 'Hollywood']
daily = df.groupby('date_occ').size().reset_index(name='collision_count')

# Calendar features
daily['dayofyear'] = daily['date_occ'].dt.dayofyear
daily['weekday'] = daily['date_occ'].dt.weekday
daily['year'] = daily['date_occ'].dt.year

# History features. They must be built from previous rows only: collision_count
# is the prediction target, so the current row's count must not be part of any
# feature.
# NOTE(review): dates without any row are absent from `daily`, so "previous row"
# is the previous date present in the data, not necessarily the previous
# calendar day — confirm whether gaps should be filled with zero counts.
daily['lag_1'] = daily['collision_count'].shift(1)
daily['lag_2'] = daily['collision_count'].shift(2)
# Mean of the three preceding rows. Rolling over lag_1 (the count shifted by one)
# keeps the current count out of the window; rolling directly over
# collision_count would include the target itself.
daily['rolling_mean_3'] = daily['lag_1'].rolling(3).mean()

In [14]:
# Remove rows containing NaN (the first rows have no lag / rolling values yet)
daily = daily.dropna()

# Target: collisions per day
y = daily['collision_count']

# Predictors: calendar and history features
X = daily.loc[:, ['dayofyear', 'weekday', 'year', 'lag_1', 'lag_2', 'rolling_mean_3']]

In [15]:
# Hold out the last 20% of the rows as the test set; shuffle=False keeps the
# rows in their existing order instead of sampling them at random.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)

In [16]:
# Random Forest: 200 trees of depth <= 10 with a fixed seed, fitted on the
# training split (fit() returns the estimator itself)
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out rows
rf_preds = rf.predict(X_test)