In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pycatch22 import catch22_all 
from joblib import Parallel, delayed 
import pandas as pd

import itertools
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go


from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.covariance import LedoitWolf  
from scipy.spatial.distance import cdist
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV





In [2]:
def extract_catch22(id: str, grp: pd.DataFrame) -> pd.Series:
    values = grp.sort_values("time")["acre_rate_mass_weighted"].to_numpy()
    res = catch22_all(values)
    return pd.Series(res["values"], index=res["names"], name=id)


def compute_features(df: pd.DataFrame, n_jobs: int = -1) -> pd.DataFrame:

    grouped = df.groupby("id", sort=False)

    # Paralelización con joblib
    results = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(extract_catch22)(id, grp) for id, grp in grouped
    )

    features_df = pd.concat(results, axis=1).T
    features_df.reset_index(inplace=True)
    features_df.rename(columns={"index": "id"}, inplace=True)
    return features_df



In [3]:
all_data=pd.read_csv('data/data_csv/all_data_noisy.csv')

In [4]:
all_data

Unnamed: 0,id,bucket,time,acre_rate_primary,acre_rate_secondary,q,e,seg,acre_rate_mass_weighted,acre_rate_mass_weighted_clean,time_days
0,010_000_03,2000,200.046532,-0.003971,-0.011159,0.1,0.0,3.0,0.004654,0.004625,60013.959459
1,010_000_03,2001,200.141060,-0.003941,-0.011797,0.1,0.0,3.0,0.004832,0.004651,60042.318018
2,010_000_03,2002,200.251149,-0.003895,-0.011635,0.1,0.0,3.0,0.004759,0.004599,60075.344837
3,010_000_03,2003,200.345299,-0.003864,-0.010908,0.1,0.0,3.0,0.004547,0.004504,60103.589706
4,010_000_03,2004,200.455242,-0.003842,-0.010054,0.1,0.0,3.0,0.004378,0.004407,60136.572515
...,...,...,...,...,...,...,...,...,...,...,...
198006,100_070_20,19996,1999.652865,-0.002382,-0.004459,1.0,0.7,20.0,0.003458,0.003398,599895.859511
198007,100_070_20,19997,1999.748086,-0.002034,-0.005470,1.0,0.7,20.0,0.003918,0.003752,599924.425813
198008,100_070_20,19998,1999.840689,-0.001769,-0.008886,1.0,0.7,20.0,0.005331,0.005328,599952.206601
198009,100_070_20,19999,1999.954098,-0.001862,-0.014619,1.0,0.7,20.0,0.008233,0.008241,599986.229356


In [5]:
features_df = compute_features(all_data, n_jobs=-1)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 198 out of 198 | elapsed:    0.6s finished


In [6]:
features_df

Unnamed: 0,id,DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_FirstMin_ac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,...,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
0,010_000_03,-0.013568,0.293458,1.926317,5.0,0.060977,-0.007191,0.977978,21.0,0.001914,...,0.250000,0.420000,-0.667500,0.322748,7.0,2.085125,0.833333,0.541667,0.625864,0.981815
1,010_000_04,0.214647,-0.105600,1.324725,5.0,0.037151,-0.174670,0.962963,11.0,0.004710,...,0.250000,-0.544000,0.284000,0.226898,8.0,2.113974,0.708333,0.604167,0.625864,1.052345
2,010_000_05,0.140121,-0.123739,3.207702,5.0,0.147247,-0.083410,0.949950,20.0,0.062500,...,0.004255,-0.603000,0.566000,0.516269,7.0,2.012290,0.687500,0.541667,0.490874,0.833052
3,010_000_06,0.212590,-0.070596,0.871814,5.0,0.014081,0.083935,0.970971,10.0,0.000836,...,0.333333,0.076000,-0.011000,0.153298,7.0,2.161796,0.125000,0.208333,0.711767,1.132192
4,010_000_07,-0.150642,0.197703,0.990576,5.0,0.017894,0.001808,0.975976,11.0,0.001115,...,0.333333,-0.039000,0.030000,0.130698,7.0,2.135313,0.791667,0.416667,0.632000,1.121654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,100_070_16,-0.799790,-0.505580,1.879384,5.0,0.170299,1.036562,0.951952,8.0,0.012382,...,0.666667,0.126000,-0.122000,0.285669,10.0,1.875090,0.166667,0.875000,0.625864,1.035874
194,100_070_17,-0.930107,-0.671705,1.729500,5.0,0.261074,0.820141,0.948949,8.0,0.024319,...,0.666667,-0.176000,0.033000,0.215419,10.0,1.874531,0.125000,0.833333,0.625864,1.084346
195,100_070_18,-0.159356,-0.948044,1.997059,5.0,0.203268,0.316253,0.939940,9.0,0.005164,...,1.000000,-0.360000,0.312000,0.295741,10.0,1.899140,0.125000,0.833333,0.625864,1.014726
196,100_070_19,-0.839070,-0.561309,1.825769,5.0,0.208506,0.914739,0.961962,8.0,0.024328,...,0.666667,0.203000,-0.126000,0.270078,10.0,1.883238,0.145833,0.750000,0.625864,1.049044


In [7]:
def standardize_features(df: pd.DataFrame):
    """
    Estandariza (media 0, varianza 1) todas las columnas numéricas
    excepto 'id'.
    """

    feature_cols = df.columns.difference(['id'])   # solo las 22 features
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[feature_cols])

    df_std = pd.DataFrame(scaled, columns=feature_cols, index=df.index)
    df_std.insert(0, 'id', df['id'])


    return df_std, scaler

In [8]:
df_standardized, fitted_scaler = standardize_features(features_df)


In [9]:
df_std = df_standardized   # o usa df_standardized en memoria
feature_cols = df_std.columns.difference(["id"])          # lista de 22 features

def compute_distances(df: pd.DataFrame, target_id: str) -> pd.DataFrame:

    X = df.loc[df["id"] != target_id, feature_cols].to_numpy()
    x0 = df.loc[df["id"] == target_id, feature_cols].to_numpy()

    euclidean  = cdist(X, x0, metric="euclidean").flatten()
    manhattan  = cdist(X, x0, metric="cityblock").flatten()
    cosine     = cdist(X, x0, metric="cosine").flatten()

    cov        = np.cov(df[feature_cols].to_numpy(), rowvar=False)
    VI         = np.linalg.inv(cov)
    mahalan    = cdist(X, x0, metric="mahalanobis", VI=VI).flatten()

    other_ids = df.loc[df["id"] != target_id, "id"].to_numpy()
    dist_df = pd.DataFrame({
        "id":        other_ids,
        "euclidean":  euclidean,
        "manhattan":  manhattan,
        "cosine":     cosine,
        "mahalanobis": mahalan,
    })

    
    dist_df[['q', 'e', 'seg']] = dist_df['id'].str.split('_', expand=True)

    dist_df['q'] = dist_df['q'].astype(int)/100
    dist_df['e'] = dist_df['e'].astype(int)/100
    dist_df.drop(columns='seg', inplace=True)
    

    return dist_df.sort_values("manhattan").reset_index(drop=True)




In [10]:

df_standardized[['q', 'e', 'seg']] = df_standardized['id'].str.split('_', expand=True)

df_standardized['q'] = df_standardized['q'].astype(int)/100
df_standardized['e'] = df_standardized['e'].astype(int)/100
df_standardized.drop(columns='seg', inplace=True)

In [11]:
df_standardized

Unnamed: 0,id,CO_Embed2_Dist_tau_d_expfit_meandiff,CO_FirstMin_ac,CO_HistogramAMI_even_2_5,CO_f1ecac,CO_trev_1_num,DN_HistogramMode_10,DN_HistogramMode_5,DN_OutlierInclude_n_001_mdrmd,DN_OutlierInclude_p_001_mdrmd,...,SB_BinaryStats_diff_longstretch0,SB_BinaryStats_mean_longstretch1,SB_MotifThree_quantile_hh,SB_TransitionMatrix_3ac_sumdiagcov,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SP_Summaries_welch_rect_area_5_1,SP_Summaries_welch_rect_centroid,q,e
0,010_000_03,-1.049897,-0.500809,-1.531430,-0.380200,-0.485598,1.382847,0.563787,-2.316667,1.366234,...,-1.582700,-0.310823,1.759642,-0.768824,0.309473,1.668682,-0.857004,1.058328,0.1,0.0
1,010_000_04,-1.205747,-0.500809,-1.674043,-0.401953,-0.955647,0.505997,1.067209,0.661998,-1.371586,...,-0.676993,-0.453086,1.944721,-0.644802,0.557218,1.169213,-1.203247,1.058328,0.1,0.0
2,010_000_05,-0.938350,-0.500809,-1.015039,-0.333867,-0.699516,0.466141,0.902811,1.544798,-1.539150,...,-1.582700,-0.325050,1.292380,1.918919,0.309473,1.085968,-0.157931,0.569247,0.1,0.0
3,010_000_06,-1.209732,-0.500809,-1.812135,-0.418330,-0.229843,0.582913,1.062670,-0.261498,0.389253,...,-1.582700,-0.467313,2.251519,-0.816659,-1.011834,-1.161645,-1.469117,1.369561,0.1,0.0
4,010_000_07,-1.220732,-0.500809,-1.789316,-0.414035,-0.460343,1.172444,0.261416,-0.133147,0.062645,...,-1.582700,-0.453086,2.081618,-0.804257,-0.186017,1.502193,-1.550759,1.080559,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,100_070_16,-1.317425,-0.500809,-0.877052,-0.381897,2.443808,-0.372877,-1.170543,-0.608983,0.531256,...,1.134421,-0.495765,0.412188,-0.304447,1.630780,-0.995155,-0.990944,1.058328,1.0,0.7
194,100_070_17,-0.952685,-0.500809,-0.333695,-0.387317,1.836398,-0.737902,-1.458010,-0.123756,-0.326443,...,1.134421,-0.495765,0.408600,0.225101,1.465617,-1.161645,-1.244715,1.058328,1.0,0.7
195,100_070_18,-0.528041,-0.500809,-0.679709,-0.377642,0.422183,-1.345102,0.242192,0.749652,-0.849015,...,1.134421,-0.481539,0.566477,-0.624630,1.465617,-1.161645,-0.954561,1.058328,1.0,0.7
196,100_070_19,-1.279441,-0.500809,-0.648352,-0.383836,2.101896,-0.495330,-1.257191,-0.621505,0.749941,...,1.134421,-0.495765,0.464458,0.225501,1.135290,-1.078400,-1.047266,1.058328,1.0,0.7


In [12]:

#Columnas clave
id_col      = "id"
target_cols = ["q", "e"] #q,e se trabajan como variables categoricas (porque finalmente son identificadores de cada serie)
feature_cols = df_standardized.columns.difference([id_col] + target_cols)

In [13]:
#Codificar categorías
le_dict = {}
Y_encoded = pd.DataFrame(index=df_standardized.index)
for col in target_cols:
    le = LabelEncoder().fit(df_standardized[col])
    Y_encoded[col] = le.transform(df_standardized[col])
    le_dict[col] = le

X = df_standardized[feature_cols]
Y = Y_encoded[target_cols]

In [14]:
#Definir clasificador multisalida (estoy usando q,e como variabkles obj)
base_clf    = RandomForestClassifier(random_state=42, n_jobs=-1)
multi_clf   = MultiOutputClassifier(base_clf, n_jobs=-1)

#Grid para optimizar hiperparametros 
param_grid = {
    "estimator__n_estimators":     [200, 500], #cantidad de arboles
    "estimator__max_depth":        [None, 20, 40], #profundidad del arbol
    "estimator__min_samples_leaf": [1, 2, 4], 
    "estimator__max_features":     ["sqrt", "log2"]
}

grid = GridSearchCV(
    multi_clf,
    param_grid,
    cv=5,
    scoring="accuracy",  # multioutput: promedia accuracy de cada salida
    n_jobs=-1
)

In [15]:
# entrenar y extraer el mejor modelo
best_multi = grid.fit(X, Y).best_estimator_

#  Calcular importancias promediadas
#    cada estimador interno tiene feature_importances_
importances_each = np.vstack([
    est.feature_importances_
    for est in best_multi.estimators_
])    # shape = (n_outputs, n_features)

importances = importances_each.mean(axis=0)
importances /= importances.sum()    # normalizar a suma = 1

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", l

In [16]:
# Ponderar el vector caracteristicas
df_weighted = df_standardized.copy()
df_weighted[feature_cols] = df_weighted[feature_cols].mul(importances, axis=1)

In [17]:
importances #ninguna feature es significativamente mas importante que el resto

array([0.05326481, 0.06963164, 0.06092498, 0.0782118 , 0.07276397,
       0.05709831, 0.01839532, 0.00997295, 0.01069043, 0.05792015,
       0.05077406, 0.07491982, 0.03516332, 0.02990096, 0.01684975,
       0.02227226, 0.11011847, 0.05342501, 0.01170445, 0.01725208,
       0.03752095, 0.0512245 ])

In [18]:
df_weighted

Unnamed: 0,id,CO_Embed2_Dist_tau_d_expfit_meandiff,CO_FirstMin_ac,CO_HistogramAMI_even_2_5,CO_f1ecac,CO_trev_1_num,DN_HistogramMode_10,DN_HistogramMode_5,DN_OutlierInclude_n_001_mdrmd,DN_OutlierInclude_p_001_mdrmd,...,SB_BinaryStats_diff_longstretch0,SB_BinaryStats_mean_longstretch1,SB_MotifThree_quantile_hh,SB_TransitionMatrix_3ac_sumdiagcov,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SP_Summaries_welch_rect_area_5_1,SP_Summaries_welch_rect_centroid,q,e
0,010_000_03,-0.055923,-0.034872,-0.093302,-0.029736,-0.035334,0.078958,0.010371,-0.023104,0.014606,...,-0.026668,-0.006923,0.193769,-0.041074,0.003622,0.028788,-0.032156,0.054212,0.1,0.0
1,010_000_04,-0.064224,-0.034872,-0.101991,-0.031437,-0.069537,0.028892,0.019632,0.006602,-0.014663,...,-0.011407,-0.010091,0.214150,-0.034449,0.006522,0.020171,-0.045147,0.054212,0.1,0.0
2,010_000_05,-0.049981,-0.034872,-0.061841,-0.026112,-0.050900,0.026616,0.016607,0.015406,-0.016454,...,-0.026668,-0.007240,0.142315,0.102518,0.003622,0.018735,-0.005926,0.029159,0.1,0.0
3,010_000_06,-0.064436,-0.034872,-0.110404,-0.032718,-0.016724,0.033283,0.019548,-0.002608,0.004161,...,-0.026668,-0.010408,0.247934,-0.043630,-0.011843,-0.020041,-0.055123,0.070155,0.1,0.0
4,010_000_07,-0.065022,-0.034872,-0.109014,-0.032382,-0.033496,0.066945,0.004809,-0.001328,0.000670,...,-0.026668,-0.010091,0.229225,-0.042967,-0.002177,0.025916,-0.058186,0.055351,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,100_070_16,-0.070172,-0.034872,-0.053434,-0.029869,0.177821,-0.021291,-0.021533,-0.006073,0.005679,...,0.019115,-0.011042,0.045390,-0.016265,0.019087,-0.017168,-0.037181,0.054212,1.0,0.7
194,100_070_17,-0.050745,-0.034872,-0.020330,-0.030293,0.133624,-0.042133,-0.026821,-0.001234,-0.003490,...,0.019115,-0.011042,0.044994,0.012026,0.017154,-0.020041,-0.046703,0.054212,1.0,0.7
195,100_070_18,-0.028126,-0.034872,-0.041411,-0.029536,0.030720,-0.076803,0.004455,0.007476,-0.009076,...,0.019115,-0.010725,0.062380,-0.033371,0.017154,-0.020041,-0.035816,0.054212,1.0,0.7
196,100_070_19,-0.068149,-0.034872,-0.039501,-0.030020,0.152942,-0.028282,-0.023126,-0.006198,0.008017,...,0.019115,-0.011042,0.051145,0.012047,0.013288,-0.018605,-0.039294,0.054212,1.0,0.7


In [19]:
#Distancias respecto de target
TARGET = "100_050_20"


In [20]:
distances = compute_distances(df_weighted, TARGET)


In [21]:
metric_cols = ["euclidean", "manhattan", "cosine", "mahalanobis"]
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
distances[metric_cols] = scaler.fit_transform(distances[metric_cols])

In [22]:
df_median = (
    distances
    .groupby(["q", "e"], as_index=False)[metric_cols]
    .median()
    .sort_values(["manhattan"])
)

In [23]:
def build_id(q, e):
    q_int = int(round(q * 100))  # 1.00 → 100, 0.25 → 25 …
    e_int = int(round(e * 100))  # 0.70 → 70 …
    return f"{q_int:03d}_{e_int:03d}"

df_median["id"] = df_median.apply(lambda r: build_id(r["q"], r["e"]), axis=1)

# (Opcional) Reordenar columnas para que id quede al frente
cols_order = ["id", "q", "e", "euclidean", "manhattan", "cosine", "mahalanobis"]
df_median = df_median[cols_order]


In [24]:
df_median

Unnamed: 0,id,q,e,euclidean,manhattan,cosine,mahalanobis
8,100_050,1.0,0.5,0.038733,0.045493,0.024621,0.180536
9,100_060,1.0,0.6,0.070803,0.067689,0.055431,0.186187
10,100_070,1.0,0.7,0.180008,0.141579,0.181099,0.292698
7,100_040,1.0,0.4,0.168043,0.193188,0.246224,0.331565
0,010_000,0.1,0.0,0.312587,0.279188,0.300805,0.27936
5,100_020,1.0,0.2,0.308885,0.334441,0.807887,0.237622
1,025_000,0.25,0.0,0.325686,0.3545,0.755106,0.358336
6,100_030,1.0,0.3,0.351123,0.398575,0.820281,0.372146
2,050_000,0.5,0.0,0.469741,0.533045,0.933293,0.240056
3,100_000,1.0,0.0,0.528294,0.612318,0.923318,0.316238


In [25]:


def evaluar_exito(id_sim: str,
                  distances: pd.DataFrame,
                  metric: str = "manhattan") -> bool:
    """
    Devuelve True si `id_sim` es el registro más cercano al TARGET
    según la métrica indicada; False en caso contrario.
    """
    # Validaciones mínimas
    if id_sim not in distances["id"].values:
        raise ValueError(f"El id_sim «{id_sim}» no está en el DataFrame.")
    if metric not in distances.columns:
        raise ValueError(f"La métrica «{metric}» no existe en el DataFrame.")

    # ID con menor distancia (fila del propio TARGET está excluida del DataFrame)
    closest_id = distances.loc[distances[metric].idxmin(), "id"]
    return id_sim == closest_id


In [26]:
exito = evaluar_exito("100_050", df_median, metric="manhattan")
print(exito)   # True  → éxito; False → fracaso


True
