In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pycatch22 import catch22_all 
from joblib import Parallel, delayed 
import pandas as pd

import itertools
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go


from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.covariance import LedoitWolf  
from scipy.spatial.distance import cdist
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV





In [2]:
def extract_catch22(id: str, grp: pd.DataFrame) -> pd.Series:
    values = grp.sort_values("time")["acre_rate_mass_weighted"].to_numpy()
    res = catch22_all(values)
    return pd.Series(res["values"], index=res["names"], name=id)


def compute_features(df: pd.DataFrame, n_jobs: int = -1) -> pd.DataFrame:

    grouped = df.groupby("id", sort=False)

    # Paralelización con joblib
    results = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(extract_catch22)(id, grp) for id, grp in grouped
    )

    features_df = pd.concat(results, axis=1).T
    features_df.reset_index(inplace=True)
    features_df.rename(columns={"index": "id"}, inplace=True)
    return features_df



In [3]:
all_data=pd.read_csv('data/data_csv/all_data_noisy.csv')
#all_data=pd.read_csv('data/data_csv/all_data_clean.csv')

In [4]:
features_df = compute_features(all_data, n_jobs=-1)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 179 out of 198 | elapsed:    0.8s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 198 out of 198 | elapsed:    0.8s finished


In [5]:
def standardize_features(df: pd.DataFrame):
    """
    Estandariza (media 0, varianza 1) todas las columnas numéricas
    excepto 'id'.
    """

    feature_cols = df.columns.difference(['id'])   # solo las 22 features
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[feature_cols])

    df_std = pd.DataFrame(scaled, columns=feature_cols, index=df.index)
    df_std.insert(0, 'id', df['id'])


    return df_std, scaler

In [6]:
df_standardized, fitted_scaler = standardize_features(features_df)


In [7]:
df_std = df_standardized   # o usa df_standardized en memoria
feature_cols = df_std.columns.difference(["id"])          # lista de 22 features

def compute_distances(df: pd.DataFrame, target_id: str) -> pd.DataFrame:

    X = df.loc[df["id"] != target_id, feature_cols].to_numpy()
    x0 = df.loc[df["id"] == target_id, feature_cols].to_numpy()

    euclidean  = cdist(X, x0, metric="euclidean").flatten()
    manhattan  = cdist(X, x0, metric="cityblock").flatten()
    cosine     = cdist(X, x0, metric="cosine").flatten()

    cov        = np.cov(df[feature_cols].to_numpy(), rowvar=False)
    VI         = np.linalg.inv(cov)
    mahalan    = cdist(X, x0, metric="mahalanobis", VI=VI).flatten()

    other_ids = df.loc[df["id"] != target_id, "id"].to_numpy()
    dist_df = pd.DataFrame({
        "id":        other_ids,
        "euclidean":  euclidean,
        "manhattan":  manhattan,
        "cosine":     cosine,
        "mahalanobis": mahalan,
    })

    
    dist_df[['q', 'e', 'seg']] = dist_df['id'].str.split('_', expand=True)

    dist_df['q'] = dist_df['q'].astype(int)/100
    dist_df['e'] = dist_df['e'].astype(int)/100
    dist_df.drop(columns='seg', inplace=True)
    

    return dist_df.sort_values("manhattan").reset_index(drop=True)




In [8]:

df_standardized[['q', 'e', 'seg']] = df_standardized['id'].str.split('_', expand=True)

df_standardized['q'] = df_standardized['q'].astype(int)/100
df_standardized['e'] = df_standardized['e'].astype(int)/100
df_standardized.drop(columns='seg', inplace=True)

In [9]:

#Columnas clave
id_col      = "id"
target_cols = ["q", "e"] #q,e se trabajan como variables categoricas (porque finalmente son identificadores de cada serie)
feature_cols = df_standardized.columns.difference([id_col] + target_cols)

In [10]:
#Codificar categorías
le_dict = {}
Y_encoded = pd.DataFrame(index=df_standardized.index)
for col in target_cols:
    le = LabelEncoder().fit(df_standardized[col])
    Y_encoded[col] = le.transform(df_standardized[col])
    le_dict[col] = le

X = df_standardized[feature_cols]
Y = Y_encoded[target_cols]

In [11]:
#Definir clasificador multisalida (estoy usando q,e como variabkles obj)
base_clf    = RandomForestClassifier(random_state=42, n_jobs=-1)
multi_clf   = MultiOutputClassifier(base_clf, n_jobs=-1)

#Grid para optimizar hiperparametros 
param_grid = {
    "estimator__n_estimators":     [10, 500], #cantidad de arboles
    "estimator__max_depth":        [None, 20, 40], #profundidad del arbol
    "estimator__min_samples_leaf": [1, 2, 4], 
    "estimator__max_features":     ["sqrt", "log2"]
}

grid = GridSearchCV(
    multi_clf,
    param_grid,
    cv=5,
    scoring="accuracy",  # multioutput: promedia accuracy de cada salida
    n_jobs=-1
)

In [12]:
# entrenar y extraer el mejor modelo
best_multi = grid.fit(X, Y).best_estimator_

#  Calcular importancias promediadas
#    cada estimador interno tiene feature_importances_
importances_each = np.vstack([
    est.feature_importances_
    for est in best_multi.estimators_
])    # shape = (n_outputs, n_features)

importances = importances_each.mean(axis=0)
importances /= importances.sum()    # normalizar a suma = 1

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", l

In [13]:
# Ponderar el vector caracteristicas
df_weighted = df_standardized.copy()
df_weighted[feature_cols] = df_weighted[feature_cols].mul(importances, axis=1)

In [14]:
#Distancias respecto de target
TARGET = "100_050_20"


In [15]:
distances = compute_distances(df_weighted, TARGET)


In [16]:
metric_cols = ["euclidean", "manhattan", "cosine", "mahalanobis"]
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
distances[metric_cols] = scaler.fit_transform(distances[metric_cols])

In [17]:
df_median = (
    distances
    .groupby(["q", "e"], as_index=False)[metric_cols]
    .median()
    .sort_values(["manhattan"])
)

In [18]:
def build_id(q, e):
    q_int = int(round(q * 100))  # 1.00 → 100, 0.25 → 25 …
    e_int = int(round(e * 100))  # 0.70 → 70 …
    return f"{q_int:03d}_{e_int:03d}"

df_median["id"] = df_median.apply(lambda r: build_id(r["q"], r["e"]), axis=1)

# (Opcional) Reordenar columnas para que id quede al frente
cols_order = ["id", "q", "e", "euclidean", "manhattan", "cosine", "mahalanobis"]
df_median = df_median[cols_order]


In [None]:
#criterios entrada grid
serie_target
id_cluster

intervalo ruido
intervalo periodo orbita en diaas
intervalo frecuencia observacion