In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# --- 0. path to data files ---
# One Ambient Weather CSV export per zone, keyed by zone label.
files = dict(
    Z1="Z1_CAJICA_ambient-weather-20250604-20251104.csv",
    Z2="Z2_GIRALDA_ambient-weather-20250604-20251104.csv",
    Z3="Z3_OIKOS_ambient-weather-20250604-20251104.csv",
)

In [None]:
# --- 1. load and inspect data ---
def load_sample(path, nrows=0):
    """Read a CSV file, optionally limited to its first rows.

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument.
    nrows : int or None, default 0
        Number of data rows to read. ``0``, a negative number or ``None``
        all mean "read the whole file".

    Returns
    -------
    pandas.DataFrame
    """
    # None is pandas' own spelling of "no limit"; the previous `nrows > 0`
    # comparison raised TypeError for it, so it is checked explicitly.
    if nrows is not None and nrows > 0:
        return pd.read_csv(path, nrows=nrows)
    return pd.read_csv(path)

# Peek at the header of every file: reading 5 rows is enough to list the columns.
for zone, csv_path in files.items():
    print(zone, load_sample(csv_path, nrows=5).columns.tolist())

In [None]:
# --- 2. standardize column names ---
def clean_colnames(df):
    """Return a copy of ``df`` whose column names are normalized.

    Each name is stripped of surrounding whitespace and lower-cased, then
    spaces become ``_``, parentheses are removed and ``%`` becomes ``pct``.
    The input frame is not modified.
    """
    # Single-pass character table; equivalent to applying the substitutions one
    # after another because no replacement text contains a character that is
    # itself substituted.
    substitutions = str.maketrans({" ": "_", "(": "", ")": "", "%": "pct"})

    def _normalize(name):
        return name.strip().lower().translate(substitutions)

    return df.rename(columns=_normalize)

In [None]:
# --- 3. full load and parse dates ---
def load_and_prepare(path):
    """Load one zone's CSV and return it indexed by a parsed, sorted ``date``.

    Steps:
      1. read the file and normalize the column names with ``clean_colnames``;
      2. pick the timestamp column: ``simple_date`` when present, else ``date``;
      3. parse it to timezone-aware (UTC) datetimes; values that cannot be
         parsed become ``NaT`` (``errors="coerce"``);
      4. drop the raw ``simple_date`` / ``date`` columns and use the parsed
         timestamps as the index, named ``date``, sorted ascending.

    Parameters
    ----------
    path : str or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All non-date columns of the file, indexed by ``date``.

    Raises
    ------
    KeyError
        If the file has neither a ``simple_date`` nor a ``date`` column.
    """
    df = clean_colnames(pd.read_csv(path))

    # Identify which date column exists (simple_date wins when both are present).
    date_col = "simple_date" if "simple_date" in df.columns else "date"
    if date_col not in df.columns:
        raise KeyError(f"no 'simple_date' or 'date' column found in {path!r}")

    # Convert to datetime before touching the columns.
    timestamps = pd.to_datetime(df[date_col], errors="coerce", utc=True)

    # Drop BOTH raw date columns before attaching the parsed one. Renaming
    # simple_date -> date while a 'date' column still existed left two columns
    # called 'date', and the old cleanup loop could not remove either of them.
    df = df.drop(columns=["simple_date", "date"], errors="ignore")
    df = df.assign(date=timestamps)

    # Set the index and sort chronologically.
    return df.set_index("date").sort_index()

In [None]:
# Load and prepare the three zone files in zone order (Z1 -> df1, Z2 -> df2, Z3 -> df3).
df1, df2, df3 = (load_and_prepare(files[zone]) for zone in ("Z1", "Z2", "Z3"))

In [None]:
# Column-name sets per zone, then every one-directional difference between zones.
set1, set2, set3 = (set(zone_df.columns) for zone_df in (df1, df2, df3))

print("Só no Z1 vs Z2:", set1.difference(set2))
print("Só no Z2 vs Z1:", set2.difference(set1))
print("Só no Z1 vs Z3:", set1.difference(set3))
print("Só no Z3 vs Z1:", set3.difference(set1))
print("Só no Z2 vs Z3:", set2.difference(set3))
print("Só no Z3 vs Z2:", set3.difference(set2))

In [None]:
# Columns present in all three zones.
common_cols = set1.intersection(set2, set3)
print("Comuns a todos:", common_cols)

In [None]:
# --- 4. harmonize column names across the zones (map equivalent names) ---
# Keys are column names as they look AFTER clean_colnames (stripped, lower-case,
# spaces -> "_", parentheses removed, "%" -> "pct"); values are the short names
# used from here on. Adjust the entries to match the columns in your files.
rename_map = {
    "outdoor_temperature_°c": "temp_c",
    "feels_like_°c": "feels_like_c",
    # clean_colnames has already replaced "%" with "pct", so a key containing
    # "%" (previously "humidity_%") could never match a cleaned column name.
    "humidity_pct": "humidity_pct",
    "wind_speed_m/sec": "wind_speed_m_s",
    "daily_rain_mm": "daily_rain_mm",
    "absolute_pressure_mmhg": "abs_pressure_mmhg",
    "relative_pressure_mmhg": "rel_pressure_mmhg",
    "solar_radiation_w/m^2": "solar_w_m2",
    "wind_direction_°": "wind_dir_deg",
    "rain_rate_mm/hr": "rain_rate_mm_h",
    "wind_gust_m/sec": "wind_gust_m_s",
    "max_wind_speed_m/sec": "max_wind_speed_m_s",
    "max_daily_gust_m/sec": "max_daily_gust_m_s",
    # load_and_prepare already renames simple_date and moves it into the index,
    # so this entry only matters for frames prepared some other way.
    "simple_date": "date"
}

# Renaming is a no-op for keys a frame does not have, so one map serves all zones.
df1.rename(columns=rename_map, inplace=True)
df2.rename(columns=rename_map, inplace=True)
df3.rename(columns=rename_map, inplace=True)

In [None]:
# Per zone: columns whose name is not one of the standardized targets in rename_map.
cols_miss_1, cols_miss_2, cols_miss_3 = (
    set(zone_df.columns).difference(rename_map.values())
    for zone_df in (df1, df2, df3)
)

print("Colunas não mapeadas no Z1:", cols_miss_1)
print("Colunas não mapeadas no Z2:", cols_miss_2)
print("Colunas não mapeadas no Z3:", cols_miss_3)

In [None]:
# Union of every zone's columns, minus the standardized names from rename_map.
all_cols = set().union(df1.columns, df2.columns, df3.columns)
cols_missing = all_cols.difference(rename_map.values())
print("Colunas não mapeadas em nenhum dataset:", cols_missing)

In [None]:
# Give all three frames the same, alphabetically sorted column layout.
# Note: despite its name, common_cols here is the UNION of the zones' columns;
# reindex adds any column a zone lacks, filled with NaN.
common_cols = sorted(set().union(df1.columns, df2.columns, df3.columns))
df1, df2, df3 = (
    zone_df.reindex(columns=common_cols) for zone_df in (df1, df2, df3)
)

In [None]:
# Summarize df3 after the reindex: columns it did not originally have were
# added by reindex and contain no values.
df3.info()

In [None]:
# Tag each zone's frame with its numeric zone id as the first column, then stack them.
# The guard keeps this cell re-runnable: DataFrame.insert raises ValueError when the
# column already exists, which is what a second execution of the cell would hit.
for zone_id, zone_df in enumerate((df1, df2, df3), start=1):
    if "zone" not in zone_df.columns:
        zone_df.insert(0, "zone", zone_id)

# Rows keep their date index; "zone" is what tells the stacked rows apart.
df = pd.concat([df1, df2, df3])

In [None]:
# Summarize the combined frame (row count, columns, non-null counts, dtypes).
df.info()

In [None]:
# Remove the co2_battery and yearly_rain_mm columns from the combined frame, in place.
# errors='ignore' makes this a no-op for a column that is absent (e.g. on a re-run).
df.drop(columns=['co2_battery', 'yearly_rain_mm'], errors='ignore', inplace=True)

In [None]:
# Summarize the combined frame again to confirm the columns were dropped.
df.info()

In [None]:
def to_numeric_cols(df):
    """Convert every column of ``df`` to a numeric dtype where possible.

    Text columns (object or string dtype) are cleaned first: string values are
    stripped of surrounding whitespace and the placeholders ``'-'``, ``'--'``
    and ``''`` become missing values. Each column is then passed to
    ``pd.to_numeric``; a column that cannot be parsed as numbers is kept as it
    is after cleaning.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place. Column labels are assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    placeholders = ["-", "--", ""]
    for c in df.columns:
        col = df[c]
        if pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col):
            # Strip only real strings. `.str.strip()` would turn any non-string
            # value in a mixed object column into NaN, and raises when the
            # column holds no strings at all.
            col = col.map(lambda v: v.strip() if isinstance(v, str) else v)
            col = col.mask(col.isin(placeholders))
        # `errors="ignore"` is deprecated in pd.to_numeric; catching the parse
        # error gives the same "leave the column alone" behavior.
        try:
            col = pd.to_numeric(col)
        except (ValueError, TypeError):
            pass
        df[c] = col
    return df

# Clean placeholder strings and convert the combined frame's columns to numbers
# where possible (to_numeric_cols also modifies df in place).
df = to_numeric_cols(df)

In [None]:
# Summarize the combined frame once more to check the dtypes after conversion.
df.info()

In [None]:
def mark_outliers_iqr(df, col, k=1.5):
    """Return the rows of ``df`` whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - k*IQR`` or above
    ``Q3 + k*IQR``, with Q1/Q3 the 25th/75th percentiles of ``df[col]``.
    Despite the name, no flag column is added: the matching rows are returned
    and ``df`` is left untouched. Rows with a missing value in ``col`` are
    never selected, since both comparisons are False for NaN.
    """
    values = df[col]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    margin = k * (upper_quartile - lower_quartile)

    too_low = values < lower_quartile - margin
    too_high = values > upper_quartile + margin
    return df.loc[too_low | too_high]

In [346]:
# Save the combined frame, writing the index to the file as well (index=True).
# The index is the parsed "date" set in load_and_prepare; "zone" is an ordinary
# column, so the frame is not a zone/date MultiIndex.
df.to_csv("zones_combined_cleaned.csv", index=True)