# Construcción de la base de datos

In [7]:
import numpy as np
import pandas as pd
import pickle
import scipy.stats as stats
import matplotlib.pyplot as plt

In [8]:
fecha_ref = pd.Timestamp('2025-01-01 00:00:00').floor('d')

In [9]:
with open("../procData/muestras_ovul_horas_norm1.pkl", "rb") as f:
    muestras_ovul = pickle.load(f)

### Filtrar series con alta desviación

In [10]:
desviaciones = []

for key, datos in muestras_ovul.items():
    df = datos["serie"]
    std = df["result"].std()
    desviaciones.append((key, std))

# DataFrame para análisis
df_std = pd.DataFrame(desviaciones, columns=["serie_id", "std"])

# Umbrales usando IQR
q1 = df_std["std"].quantile(0.25)
q3 = df_std["std"].quantile(0.75)
iqr = q3 - q1
lim_sup = q3 + 1.5 * iqr

# Opción alternativa más restrictiva:
# lim_inf = df_std["std"].mean() - 2 * df_std["std"].std()
# lim_sup = df_std["std"].mean() + 2 * df_std["std"].std()

# Filtrar series dentro del rango
series_filtradas_std = df_std[(df_std["std"] <= lim_sup)]["serie_id"].tolist()

In [11]:
# Dataset limpio: solo series con desviación típica aceptable
muestras_ovul_filtrado = {}

for serie_id in series_filtradas_std:
    if serie_id in muestras_ovul:
        muestras_ovul_filtrado[serie_id] = muestras_ovul[serie_id]


print(f"Total original de series: {len(muestras_ovul)}")
print(f"Total después del filtrado por desviación típica: {len(muestras_ovul_filtrado)}")

Total original de series: 59
Total después del filtrado por desviación típica: 58


### Series con medias de los intervalos máximos

In [23]:
intervalo_dias = [-3, 4]  # Incluye ambos extremos, 8 días
intervalo_horas_max = [21, 3] 
intervalo_horas_min = [7, 15]  

num_dias = intervalo_dias[1] - intervalo_dias[0] + 1  # De -3 a 4 = 8 días

In [24]:
muestras_max = {}

for serie_id, datos in muestras_ovul_filtrado.items():
    df = datos["serie"].copy()

    df["dias_rel"] = (df["resultTimestamp"] - fecha_ref) / pd.Timedelta(days=1)
    df["hora"] = df["resultTimestamp"].dt.hour
    df["fecha"] = df["resultTimestamp"].dt.date

    # Filtro intervalo horario (considera cruce de medianoche)
    if intervalo_horas_max[0] > intervalo_horas_max[1]:
        filtro_max = (df["hora"] >= intervalo_horas_max[0]) | (df["hora"] <= intervalo_horas_max[1])
    else:
        filtro_max = (df["hora"] >= intervalo_horas_max[0]) & (df["hora"] <= intervalo_horas_max[1])
    df_filtrado_max = df[filtro_max].copy()

    # Ajuste para horas después de medianoche (asignar al día anterior)
    df_filtrado_max.loc[df_filtrado_max["hora"] <= intervalo_horas_max[0], "dias_rel"] -= 1

    df_filtrado_max["dia_entero"] = np.floor(df_filtrado_max["dias_rel"]).astype(int) + 1

    medias_intervalo = df_filtrado_max.groupby("dia_entero")["result"].mean()

    medias_intervalo = medias_intervalo.reindex(range(intervalo_dias[0], intervalo_dias[1] + 1))

    muestras_max[serie_id] = medias_intervalo.reset_index(drop=True)

In [25]:
print(len(muestras_max))

longitudes = [len(datos) for datos in muestras_max.values()]
all(l == num_dias for l in longitudes)

58


True

In [26]:
X_max = []

for serie_id, datos in muestras_max.items():
    valores = datos.values
    X_max.append(valores)


columnas = [f"t_{i}" for i in range(0, num_dias)] 

df_series_max = pd.DataFrame(X_max, columns=columnas)

In [30]:
muestras_min = {}

for serie_id, datos in muestras_ovul_filtrado.items():
    df = datos["serie"].copy()

    df["dias_rel"] = (df["resultTimestamp"] - fecha_ref) / pd.Timedelta(days=1)
    df["hora"] = df["resultTimestamp"].dt.hour
    df["fecha"] = df["resultTimestamp"].dt.date

    # Filtro intervalo horario (considera cruce de medianoche)
    if intervalo_horas_min[0] > intervalo_horas_min[1]:
        filtro_min = (df["hora"] >= intervalo_horas_min[0]) | (df["hora"] <= intervalo_horas_min[1])
    else:
        filtro_min = (df["hora"] >= intervalo_horas_min[0]) & (df["hora"] <= intervalo_horas_min[1])
    df_filtrado_min = df[filtro_min].copy()

    # Ajuste para horas después de medianoche (asignar al día anterior)
    #df_filtrado_min.loc[df_filtrado_min["hora"] <= intervalo_horas_max[0], "dias_rel"] -= 1 # En este caso no hace nada

    df_filtrado_min["dia_entero"] = np.floor(df_filtrado_min["dias_rel"]).astype(int)

    medias_intervalo = df_filtrado_min.groupby("dia_entero")["result"].mean()

    medias_intervalo = medias_intervalo.reindex(range(intervalo_dias[0], intervalo_dias[1] + 1))

    muestras_min[serie_id] = medias_intervalo.reset_index(drop=True)

In [31]:
print(len(muestras_min))

longitudes = [len(datos) for datos in muestras_min.values()]
all(l == num_dias for l in longitudes)

58


True

In [33]:
muestras_mean = {}

for serie_id, datos in muestras_ovul_filtrado.items():
    df = datos["serie"].copy()

    df["dias_rel"] = (df["resultTimestamp"] - fecha_ref) / pd.Timedelta(days=1)
    df["hora"] = df["resultTimestamp"].dt.hour
    df["fecha"] = df["resultTimestamp"].dt.date

    df["dia_entero"] = np.floor(df["dias_rel"]).astype(int)
    medias_diarias = df.groupby("dia_entero")["result"].mean()
    medias_diarias = medias_diarias.reindex(range(intervalo_dias[0], intervalo_dias[1] + 1))
    muestras_mean[serie_id] = medias_diarias.reset_index(drop=True)

In [34]:
print(len(muestras_mean))

longitudes = [len(datos) for datos in muestras_mean.values()]
all(l == num_dias for l in longitudes)

58


True

In [36]:
X_max_min = []

for serie_id in muestras_max.keys():
    max_vals = muestras_max[serie_id].values
    min_vals = muestras_min[serie_id].values

    # Asegúrate de que ambas tienen la misma longitud
    assert len(max_vals) == len(min_vals) == num_dias, f"Longitudes incompatibles en {serie_id}"

    # Alternar máximo y mínimo: [max_0, min_0, max_1, min_1, ..., max_7, min_7]
    combinado = np.empty(2 * num_dias)
    combinado[0::2] = max_vals  # posiciones pares (0, 2, 4...) → máximos
    combinado[1::2] = min_vals  # posiciones impares (1, 3, 5...) → mínimos

    X_max_min.append(combinado)

# Nombres de columnas: t_0_max, t_0_min, ..., t_7_max, t_7_min
columnas_max_min = [f"t_{i}_{tipo}" for i in range(num_dias) for tipo in ("max", "min")]

df_series_max_min = pd.DataFrame(X_max_min, columns=columnas_max_min)

In [40]:
X_max_mean_min = []

for serie_id in muestras_max.keys():
    max_vals = muestras_max[serie_id].values
    mean_vals = muestras_mean[serie_id].values
    min_vals = muestras_min[serie_id].values

    # Asegúrate de que ambas tienen la misma longitud
    assert len(max_vals) == len(min_vals) == len(mean_vals)== num_dias, f"Longitudes incompatibles en {serie_id}"

    # Alternar máximo y mínimo: [max_0, min_0, max_1, min_1, ..., max_7, min_7]
    combinado = np.empty(3 * num_dias)
    combinado[0::3] = max_vals
    combinado[1::3] = mean_vals 
    combinado[2::3] = min_vals 

    X_max_mean_min.append(combinado)

# Nombres de columnas: t_0_max, t_0_min, ..., t_7_max, t_7_min
columnas_max_mean_min = [f"t_{i}_{tipo}" for i in range(num_dias) for tipo in ("max", "mean", "min")]
_
df_series_max_mean_min = pd.DataFrame(X_max_mean_min, columns=columnas_max_mean_min)

In [41]:
df_series_max.to_csv("../procData/dataset_medias_tramos_max.csv", index=False)

In [42]:
df_series_max_min.to_csv("../procData/dataset_medias_tramos_max_min.csv", index=False)

In [43]:
df_series_max_mean_min.to_csv("../procData/dataset_medias_tramos_max_mean_min.csv", index=False)

In [44]:
df_series_max_mean_min

Unnamed: 0,t_0_max,t_0_mean,t_0_min,t_1_max,t_1_mean,t_1_min,t_2_max,t_2_mean,t_2_min,t_3_max,...,t_4_min,t_5_max,t_5_mean,t_5_min,t_6_max,t_6_mean,t_6_min,t_7_max,t_7_mean,t_7_min
0,36.485749,36.437823,36.425615,36.51925,36.481384,36.523837,36.457202,36.469614,36.508095,36.45244,...,36.462541,36.539514,36.52537,36.513251,36.496142,36.476345,36.51674,36.510173,36.524315,36.573287
1,36.339502,36.362292,36.363716,36.384851,36.419756,36.406998,36.547697,36.512627,36.526812,36.523363,...,36.524405,36.518347,36.509402,36.511216,36.743067,36.616454,36.4226,36.845841,36.634804,36.497488
2,36.475772,36.474806,36.481283,36.177718,36.361392,36.440487,36.457616,36.538347,36.560319,36.483683,...,36.548785,36.36115,36.464504,36.539195,36.438655,36.481699,36.486158,36.447241,36.422082,36.423226
3,36.479153,36.5085,36.516249,36.516234,36.519155,36.518516,36.503842,36.439441,36.41255,36.359785,...,36.338142,36.469466,36.492569,36.412621,36.519807,36.562129,36.554111,36.434522,36.421107,36.356963
4,36.528132,36.475634,36.390464,36.542202,36.46829,36.442071,36.52173,36.516484,36.486755,36.478347,...,36.38508,36.470273,36.47328,36.429409,36.630829,36.558206,36.485783,36.595371,36.504007,36.439015
5,36.422131,36.426074,36.41255,36.561318,36.471668,36.245874,36.549617,36.399469,36.139875,36.575016,...,36.574897,36.617319,36.478576,36.362283,36.628377,36.556601,36.546577,36.65643,36.457743,36.408776
6,36.428263,36.525925,36.45211,36.534097,36.416507,36.282429,36.53319,36.427206,36.434485,36.40603,...,36.543868,36.647397,36.537087,36.51872,36.494338,36.504899,36.334284,36.737086,36.534398,36.409451
7,36.39406,36.392178,36.368786,36.468907,36.477179,36.461015,36.482678,36.423165,36.414259,36.519848,...,36.460322,36.578102,36.499317,36.428815,36.66243,36.622951,36.561532,36.881985,36.610963,36.463651
8,36.538649,36.474216,36.485022,36.419012,36.44739,36.463577,36.514418,36.559181,36.561524,36.655491,...,36.663773,36.625085,36.557191,36.652155,36.378185,36.310721,36.262204,36.376624,36.42929,36.366887
9,36.550529,36.406368,36.278398,36.449364,36.392118,36.36504,36.57403,36.430376,36.304181,36.588392,...,36.567933,36.555637,36.533745,36.55327,36.526735,36.528872,36.481864,36.777856,36.82648,36.738272
