# Limpieza de datos
**Objetivo:** limpiar y preparar los datos de viajes del mes de diciembre de 2019.
Se conservarán las columnas: 
`tpep_pickup_datetime`, `tpep_dropoff_datetime`, `passenger_count`,
`trip_distance`, `pulocationid`, `dolocationid`, `payment_type`,
`fare_amount`, `tip_amount`, `total_amount`, `congestion_surcharge`.


In [40]:
import os 
import sqlite3
import pandas as pd
import numpy as np

# Project root: the parent of the current working directory.
BASE_DIR = os.path.dirname(os.getcwd())

# Raw trip database (2019-12) opened by the load cell.
db_path = os.path.join(BASE_DIR, "data", "raw", "2019-12.sqlite")

# Taxi zone lookup CSV, resolved from BASE_DIR instead of a machine-specific
# absolute path so the notebook also runs on other checkouts.
lookup_path = os.path.join(BASE_DIR, "data", "taxi_zone_lookup.csv")

# NOTE(review): these two paths are relative to the current working directory,
# not to BASE_DIR, and are not used by the cells visible here — confirm intent
# before relying on them.
processed_csv = r"data/processed/2019-12.csv"
processed_sqlite = r"data/processed/2019-12.sqlite"

# Optional row cap for quick experiments; a falsy value loads the whole table.
SAMPLE_LIMIT = None

In [42]:
# Load trip records from the SQLite table 'tripdata'.
# db_path and SAMPLE_LIMIT are defined in the configuration cell.
if not os.path.exists(db_path):
    # sqlite3.connect() would otherwise create an empty database at this path
    # and the query below would fail with a misleading "no such table" error.
    raise FileNotFoundError(f"No se encontró la base de datos SQLite: {db_path}")

if SAMPLE_LIMIT:
    # int() guarantees that only a plain integer is interpolated into the SQL.
    query = f"SELECT * FROM tripdata LIMIT {int(SAMPLE_LIMIT)}"
else:
    query = "SELECT * FROM tripdata;"

conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even if the query raises.
    conn.close()

print("Registros cargados:", len(df))
print("Columnas cargadas:", df.columns.tolist())



Registros cargados: 6896317
Columnas cargadas: ['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'ratecodeid', 'store_and_fwd_flag', 'pulocationid', 'dolocationid', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge']


In [43]:
# Columns kept for the analysis; every other column of the loaded table is dropped.
cols_keep = [
    'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'passenger_count', 'trip_distance',
    'pulocationid', 'dolocationid',
    'payment_type', 'fare_amount', 'tip_amount',
    'total_amount', 'congestion_surcharge'
]

# Fail fast when the loaded table lacks any expected column.
cols_miss = [c for c in cols_keep if c not in df.columns]
if cols_miss:
    # Single formatted message: passing the list as a second positional
    # argument would render the error as a tuple.
    raise ValueError(f"Faltan columnas esperadas en la tabla: {cols_miss}")

# .copy() detaches the selection from the originally loaded frame.
df = df[cols_keep].copy()
print("Shape tras seleccionar columnas:", df.shape)
df.head(3)


Shape tras seleccionar columnas: (6896317, 11)


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pulocationid,dolocationid,payment_type,fare_amount,tip_amount,total_amount,congestion_surcharge
0,2019-12-01 00:26:58.000000,2019-12-01 00:41:45.000000,1.0,4.2,142.0,116.0,2.0,14.5,0.0,18.3,2.5
1,2019-12-01 00:12:08.000000,2019-12-01 00:12:14.000000,1.0,0.0,145.0,145.0,2.0,2.5,0.0,3.8,0.0
2,2019-12-01 00:25:53.000000,2019-12-01 00:26:04.000000,1.0,0.0,145.0,145.0,2.0,2.5,0.0,3.8,0.0


In [44]:
# Normalise dtypes: timestamps -> datetime, location IDs and passenger count ->
# nullable Int64, remaining measures -> numeric.
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    # Unparseable timestamps become NaT instead of raising.
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce')

# The nullable 'Int64' dtype can hold integers together with missing values.
for int_col in ('pulocationid', 'dolocationid', 'passenger_count'):
    df[int_col] = df[int_col].astype('Int64', copy=False)

# Coerce the remaining measures to numeric; values that cannot be parsed become NaN.
numeric_cols = ['trip_distance', 'payment_type', 'fare_amount', 'tip_amount', 'total_amount', 'congestion_surcharge']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Quick check of the resulting dtypes and the null count per column.
print(df.dtypes)
print("\nNulos por columna (después de parse):")
print(df.isna().sum())

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                   Int64
trip_distance                   float64
pulocationid                      Int64
dolocationid                      Int64
payment_type                    float64
fare_amount                     float64
tip_amount                      float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

Nulos por columna (después de parse):
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          51018
trip_distance                0
pulocationid                 0
dolocationid                 0
payment_type             51018
fare_amount                  0
tip_amount                   0
total_amount                 0
congestion_surcharge         0
dtype: int64


In [45]:
# Drop rows missing either timestamp or the total amount.
required_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'total_amount']
df.dropna(subset=required_cols, inplace=True)

# Keep only trips whose distance, fare and total are strictly positive.
# Zeros are removed as well as negatives, and NaN never compares greater
# than zero, so nulls in these columns are discarded too.
positive_cols = ['trip_distance', 'fare_amount', 'total_amount']
all_positive = df[positive_cols].gt(0).all(axis=1)
df = df[all_positive]
print("✅ Filas después de limpieza:", df.shape[0])
df.head()

✅ Filas después de limpieza: 6803514


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pulocationid,dolocationid,payment_type,fare_amount,tip_amount,total_amount,congestion_surcharge
0,2019-12-01 00:26:58,2019-12-01 00:41:45,1,4.2,142,116,2.0,14.5,0.0,18.3,2.5
3,2019-12-01 00:12:03,2019-12-01 00:33:19,2,9.4,138,25,1.0,28.5,10.0,39.8,0.0
4,2019-12-01 00:05:27,2019-12-01 00:16:32,2,1.6,161,237,2.0,9.0,0.0,12.8,2.5
5,2019-12-01 00:58:51,2019-12-01 01:08:37,2,1.0,161,230,2.0,6.5,0.0,10.3,2.5
6,2019-12-01 00:14:19,2019-12-01 00:27:06,0,1.7,164,163,2.0,10.0,0.0,13.8,2.5


In [46]:
# Trip duration in minutes: drop-off minus pick-up, converted from seconds.
elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
df["trip_duration_min"] = elapsed.dt.total_seconds() / 60

# Discard trips whose duration is zero or negative.
has_positive_duration = df["trip_duration_min"].gt(0)
df = df.loc[has_positive_duration]

# Round only after filtering, so the filter uses the unrounded duration.
df["trip_duration_min"] = df["trip_duration_min"].round(2)

print("Duracion de viaje calculada y registros negativos eliminados.")
df[["trip_distance", "trip_duration_min"]].describe()


Duracion de viaje calculada y registros negativos eliminados.


Unnamed: 0,trip_distance,trip_duration_min
count,6803219.0,6803219.0
mean,3.017772,18.64643
std,8.323634,70.47607
min,0.01,0.02
25%,0.99,6.92
50%,1.63,11.75
75%,3.07,19.47
max,19130.18,6114.78


In [49]:
# Persist the cleaned frame as CSV under <BASE_DIR>/data/processed.
# `os` and BASE_DIR come from the configuration cell at the top of the
# notebook; they are intentionally not re-imported or recomputed here.
processed_dir = os.path.join(BASE_DIR, "data", "processed")

# Create the output folder if it does not exist yet.
os.makedirs(processed_dir, exist_ok=True)

# NOTE(review): the configuration cell defines processed_csv ("2019-12.csv"),
# but this cell writes "tripdata_clean.csv" — confirm which name downstream
# consumers expect.
output_path = os.path.join(processed_dir, "tripdata_clean.csv")

# index=False keeps the DataFrame index out of the file.
df.to_csv(output_path, index=False)

print(f" Datos limpios guardados correctamente en:\n{output_path}")


 Datos limpios guardados correctamente en:
c:\Users\Laptop\Desktop\nyc-taxi-demanda-2019\nyc-taxi-demanda-2019\data\processed\tripdata_clean.csv
