# In [None]:
# travel_features_rich_generator_v3.ipynb
# Gera travel_features_rich.csv com features extras (distância, tipo de usuário, flags de clima)

from pathlib import Path

import numpy as np
import pandas as pd

# ===== Função para distância Haversine (em km) =====
def haversine_km(lat1, lon1, lat2, lon2):
    """
    Return the great-circle (haversine) distance in km between two points.

    All four arguments are coordinates in degrees. Scalars, lists, NumPy
    arrays and pandas Series are accepted; array-like inputs are processed
    element-wise and the result has the matching shape.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    Distance(s) in kilometres, using a spherical Earth of radius 6371 km.
    """
    R = 6371.0  # Earth radius in km

    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    # np.subtract (instead of the `-` operator) also works for plain lists
    # while giving the same result for scalars, arrays and Series.
    dlat = np.radians(np.subtract(lat2, lat1))
    dlon = np.radians(np.subtract(lon2, lon1))

    a = (
        np.sin(dlat / 2.0) ** 2
        + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0) ** 2
    )
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# === Load and combine the raw trip data === #
df1 = pd.read_csv("data/travels/travels_1.csv", low_memory=False)
df2 = pd.read_csv("data/travels/travels_2.csv", low_memory=False)
# ignore_index=True gives the combined frame a unique 0..n-1 index; without
# it the row labels of both files are kept and the index contains duplicates.
travels = pd.concat([df1, df2], ignore_index=True)
print(f"Viagens brutas: {len(travels)}")

# Parse the timestamps and compute the trip duration in minutes
travels["started_at"] = pd.to_datetime(travels["started_at"])
travels["ended_at"] = pd.to_datetime(travels["ended_at"])
travels["duration_min"] = (
    travels["ended_at"] - travels["started_at"]
).dt.total_seconds() / 60

# Keep only trips lasting strictly more than 1 and strictly less than 60 min.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not write to a slice (no SettingWithCopyWarning).
travels = travels[
    (travels["duration_min"] > 1) & (travels["duration_min"] < 60)
].copy()
print(f"Viagens após filtro de duração (1–60 min): {len(travels)}")

# Derive hour of day, weekday (Monday=0) and calendar date from the start time
travels["hour"] = travels["started_at"].dt.hour
travels["weekday"] = travels["started_at"].dt.weekday
travels["date"] = travels["started_at"].dt.date

# ===== Spatial and user features =====
# Straight-line (haversine) distance in km between trip start and end points.
coord_cols = ["start_lat", "start_lng", "end_lat", "end_lng"]
has_coords = set(coord_cols) <= set(travels.columns)
if not has_coords:
    # Without coordinates the column is still created (all NaN) so that later
    # steps referencing distance_km do not fail on a missing column.
    print("⚠️ Colunas de coordenadas não encontradas. distance_km não será criada.")
    travels["distance_km"] = np.nan
else:
    start_lat, start_lng, end_lat, end_lng = (travels[c] for c in coord_cols)
    travels["distance_km"] = haversine_km(start_lat, start_lng, end_lat, end_lng)

# rideable_type and member_casual are kept untouched (categorical inputs for
# the CVAE). When one of them is absent it is created with the placeholder
# value "unknown" so downstream column selection does not raise.
absent_categoricals = [
    name for name in ("rideable_type", "member_casual") if name not in travels.columns
]
for col in absent_categoricals:
    travels[col] = "unknown"

# ===== Load the weather data =====
weather = pd.read_csv("data/weather/weather.csv")
weather_timestamps = pd.to_datetime(weather["datetime"])
weather["datetime"] = weather_timestamps
# date + hour are the keys later used to join weather onto the trips
weather["date"] = weather_timestamps.dt.date
weather["hour"] = weather_timestamps.dt.hour
print(f"Registros de clima (horários): {len(weather)}")

# Expanded set of weather variables to keep, grouped by theme
# (severerisk and stations are deliberately left out).
join_key_cols = ["date", "hour"]
thermal_cols = ["temp", "feelslike", "dew", "humidity"]
precipitation_cols = ["precip", "precipprob", "preciptype", "snow", "snowdepth"]
wind_cols = ["windgust", "windspeed", "winddir"]
atmosphere_cols = ["sealevelpressure", "cloudcover", "visibility"]
solar_cols = ["solarradiation", "solarenergy", "uvindex"]
description_cols = ["conditions", "icon"]

selected_features = [
    *join_key_cols,
    *thermal_cols,
    *precipitation_cols,
    *wind_cols,
    *atmosphere_cols,
    *solar_cols,
    *description_cols,
]
# Independent copy so the NaN handling that follows does not touch `weather`
weather_filtered = weather.loc[:, selected_features].copy()

# ==== NaN handling for the weather table, done BEFORE the merge ==== #
# Numeric columns where a missing value is treated as 0 (intensities/amounts)
fill_zero_cols = ["snow", "snowdepth", "windgust", "solarradiation", "solarenergy", "precipprob"]

# Remaining numeric columns: filled with the column mean whenever any value
# is missing (no threshold on the number of missing values is applied).
# NOTE(review): winddir looks like an angle, for which an arithmetic mean may
# not be meaningful, and precip is mean-filled rather than zero-filled — confirm.
mean_fill_cols = [
    "feelslike",
    "dew",
    "winddir",
    "sealevelpressure",
    "cloudcover",
    "visibility",
    "uvindex",
    "temp",
    "humidity",
    "precip",
    "windspeed",
]

# Constant fills: categorical columns get the string 'none', the
# intensity/amount columns get 0.
constant_fills = {
    **dict.fromkeys(["preciptype", "conditions", "icon"], "none"),
    **dict.fromkeys(fill_zero_cols, 0),
}
for col, fill_value in constant_fills.items():
    if col not in weather_filtered.columns:
        continue
    weather_filtered[col] = weather_filtered[col].fillna(fill_value)

for col in mean_fill_cols:
    if col not in weather_filtered.columns:
        continue
    column_values = weather_filtered[col]
    if column_values.isna().any():
        weather_filtered[col] = column_values.fillna(column_values.mean())

# ===== Merge trips + weather =====
merge_keys = ["date", "hour"]

# A left merge emits one output row per matching weather row, so a repeated
# (date, hour) key in the weather table would silently duplicate trips.
# Keep only the first weather record per key before joining.
n_dup_weather = int(weather_filtered.duplicated(subset=merge_keys).sum())
if n_dup_weather:
    print(
        f"Registros de clima duplicados por (date, hour) removidos: {n_dup_weather}"
    )
weather_hourly = weather_filtered.drop_duplicates(subset=merge_keys, keep="first")

df_merged = pd.merge(travels, weather_hourly, on=merge_keys, how="left")
print(f"Linhas após merge viagens+clima: {len(df_merged)}")

# Rows for which any of the essential weather variables is still missing
essential_cols = ["temp", "humidity", "precip", "windspeed"]
missing_essential_mask = df_merged[essential_cols].isna().any(axis=1)
n_missing_essential = missing_essential_mask.sum()
print(
    f"Viagens sem clima essencial (sem match ou dado muito faltante): {n_missing_essential}"
)

# Drop those rows; .copy() yields an independent frame for later assignments
df_merged_clean = df_merged[~missing_essential_mask].copy()
print(f"Linhas após remover sem clima essencial: {len(df_merged_clean)}")

# ===== Flags derived from weather and time =====
precip_type = df_merged_clean["preciptype"]
# Case-insensitive substring matches; missing values count as "no match"
mentions_rain = precip_type.str.contains("rain", case=False, na=False)
mentions_snow = precip_type.str.contains("snow", case=False, na=False)

# is_rainy: 1 when precip > 0 OR preciptype mentions 'rain', else 0
df_merged_clean["is_rainy"] = (df_merged_clean["precip"].gt(0) | mentions_rain).astype(int)

# is_snowy: 1 when snow > 0 OR preciptype mentions 'snow', else 0
df_merged_clean["is_snowy"] = (df_merged_clean["snow"].gt(0) | mentions_snow).astype(int)

# is_peak_hour: 1 when the start hour is 7, 8, 9, 10, 16, 17, 18 or 19
peak_hours = [*range(7, 11), *range(16, 20)]
df_merged_clean["is_peak_hour"] = df_merged_clean["hour"].isin(peak_hours).astype(int)

# ===== Drop rows whose distance_km is NaN (if there are any) =====
distance_missing = df_merged_clean["distance_km"].isna()
n_nan_dist = distance_missing.sum()
if n_nan_dist > 0:
    print(f"Viagens com distance_km NaN: {n_nan_dist}")
    df_merged_clean = df_merged_clean.loc[~distance_missing].copy()
    print(f"Linhas após remover distance_km NaN: {len(df_merged_clean)}")

# ===== Select the final features =====
# Column order here is the column order of the exported CSV.
final_columns = [
    # trip features
    "duration_min",
    "hour",
    "weekday",
    "distance_km",
    "rideable_type",
    "member_casual",
    # derived flags
    "is_peak_hour",
    "is_rainy",
    "is_snowy",
    # weather features
    "temp",
    "feelslike",
    "dew",
    "humidity",
    "precip",
    "precipprob",
    "preciptype",
    "snow",
    "snowdepth",
    "windgust",
    "windspeed",
    "winddir",
    "sealevelpressure",
    "cloudcover",
    "visibility",
    "solarradiation",
    "solarenergy",
    "uvindex",
    "conditions",
    "icon",
]
df_rich = df_merged_clean[final_columns].copy()

# Final NaN check (reported only; no rows are removed here)
total_nans = df_rich.isna().sum().sum()
print(f"NaNs restantes em df_rich (total de células): {total_nans}")

# Save the final CSV. to_csv does not create missing parent directories, so
# make sure the output folder exists instead of failing after all the work.
output_path = "data/processed/travel_features_rich.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_rich.to_csv(output_path, index=False)
print("✅ travel_features_rich.csv salvo com sucesso em:", output_path)

# Show a sample of the result
print(df_rich.head())
