# Importar librerías y definir funciones auxiliares

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense



# 1. Carga y limpieza

In [3]:
df = pd.read_csv('Insumos/consolidated_data.csv', low_memory=False)


def _find_column(columns, *keywords):
    """Return the first column whose lower-cased name contains any keyword.

    Parameters
    ----------
    columns : iterable
        Candidate column labels (e.g. ``df.columns``).
    *keywords : str
        Lower-case substrings to look for; the first column matching any of
        them wins, in the order the columns appear.

    Raises
    ------
    ValueError
        If no column matches, listing the keywords and the available columns
        so the missing field is obvious (instead of a bare IndexError).
    """
    for col in columns:
        lowered = str(col).lower()
        if any(key in lowered for key in keywords):
            return col
    raise ValueError(
        f"No column matching {keywords!r} found; available columns: {list(columns)!r}"
    )


# Locate the source columns by substring so Spanish or English headers both work.
date_col   = _find_column(df.columns, 'fecha', 'date')
client_col = _find_column(df.columns, 'cliente', 'client')
pres_col   = _find_column(df.columns, 'pres')
temp_col   = _find_column(df.columns, 'temp')
# 'volu' already implies 'vol', so a single keyword is equivalent to the old double test.
vol_col    = _find_column(df.columns, 'volu')

# Parse timestamps; non-numeric sensor readings become NaN (errors='coerce').
df[date_col] = pd.to_datetime(df[date_col])
for col in (pres_col, temp_col, vol_col):
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Canonical column names used by every later cell.
df = df.rename(columns={
    date_col: 'Fecha',
    client_col: 'Cliente',
    pres_col: 'Presión',
    temp_col: 'Temperatura',
    vol_col: 'Volumen'
})

# Remove repeated (client, timestamp) rows, then fill gaps client by client.
df = df.sort_values(['Cliente', 'Fecha']).drop_duplicates(['Cliente', 'Fecha'])

sensor_cols = ['Presión', 'Temperatura', 'Volumen']
frames = []
for _, client_rows in df.groupby('Cliente'):
    # Time-weighted interpolation needs the timestamps as a sorted index.
    by_time = client_rows.set_index('Fecha').sort_index()
    filled = {name: by_time[name].interpolate(method='time') for name in sensor_cols}
    frames.append(by_time.assign(**filled).reset_index())
df = pd.concat(frames, ignore_index=True)

# Calendar features derived from the timestamp, then z-score every model input.
timestamps = df['Fecha'].dt
df['hour'] = timestamps.hour
df['dayofweek'] = timestamps.dayofweek
df['month'] = timestamps.month

features = ['Presión', 'Temperatura', 'Volumen', 'hour', 'dayofweek', 'month']
# NOTE(review): the scaler is fitted on all rows, i.e. before the 70/30
# train/test split performed in the evaluation cell.
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# —————————————————————————————————————————————
# 2. Client segmentation
# —————————————————————————————————————————————
# One row per client: mean and standard deviation of each sensor variable.
stats = df.groupby('Cliente')[['Presión','Temperatura','Volumen']].agg(['mean','std'])
stats.columns = ['_'.join(c) for c in stats.columns]

# A client with a single reading has an undefined (NaN) sample std, and KMeans
# rejects NaN input; treat such a client as having zero spread.
std_cols = [c for c in stats.columns if c.endswith('_std')]
stats[std_cols] = stats[std_cols].fillna(0.0)

stats_scaled = StandardScaler().fit_transform(stats)
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones), which would otherwise
# make the clustering depend on the installed version.
stats['segment'] = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(stats_scaled)
# The segment labels stay in `stats`; they are not merged back into `df`.





# 3. Evaluación por segmento con modelos temporales

In [20]:
# —————————————————————————————————————————————
# 3. Per-segment evaluation with temporal models
# —————————————————————————————————————————————
SEED = 42                                        # shared by every seeded step in this cell
TARGETS = ['Presión', 'Temperatura', 'Volumen']  # variables that are lagged, perturbed and predicted

results = []
lags = [1,2,3]  # three lags to capture short-term temporal dependence

# Dedicated generator so the injected anomaly noise is identical on every run
# (the global np.random state was previously left unseeded).
rng = np.random.default_rng(SEED)

# NOTE(review): a single pass is made over the whole dataset; the KMeans
# segment labels are not used to filter rows here.
for seg in range(1):
    df_seg = df.sort_values('Fecha')
    # Lagged copies of each target, shifted within each client's own series.
    for lag in lags:
        for var in TARGETS:
            df_seg[f'{var}_lag{lag}'] = df_seg.groupby('Cliente')[var].shift(lag)
    # Drops every row that has a NaN in any column (including the first
    # max(lags) rows of each client, whose lags are undefined).
    df_seg = df_seg.dropna().reset_index(drop=True)

    # Chronological 70/30 split (rows are ordered by 'Fecha').
    idx = int(len(df_seg)*0.7)
    train = df_seg.iloc[:idx].copy()
    test  = df_seg.iloc[idx:].copy()

    # Pick 1% of the test rows (at least one) to become synthetic anomalies.
    n_anom = max(1, int(0.01 * len(test)))
    ani = test.sample(n=n_anom, random_state=SEED).index

    # Ground-truth labels for the injected anomalies.
    test['anomaly'] = 0
    test.loc[ani, 'anomaly'] = 1

    # Add zero-mean Gaussian noise with 3x the variable's test std to the chosen rows.
    # The lag columns were computed before this step and are left untouched.
    for var in TARGETS:
        sigma = test[var].std()
        test.loc[ani, var] += rng.normal(loc=0, scale=3*sigma, size=n_anom)

    # A) Isolation Forest on the standardized features.
    if_model = IsolationForest(contamination=0.015, random_state=SEED)
    if_model.fit(train[features])
    # predict() returns 1 for inliers and -1 for outliers; map to 0/1 anomaly flags.
    pred_if = np.where(if_model.predict(test[features]) == 1, 0, 1)

    # B) Temporal model (MLPRegressor): predict current targets from their lags.
    lag_features = [f'{v}_lag{l}' for l in lags for v in TARGETS]
    X_tr = train[lag_features]
    y_tr = train[TARGETS]
    X_te = test[lag_features]

    temp_model = MLPRegressor(hidden_layer_sizes=(len(X_tr.columns)//2,), max_iter=200, random_state=SEED)
    temp_model.fit(X_tr, y_tr)
    preds = temp_model.predict(X_te)
    mse = np.mean((test[TARGETS].values - preds)**2, axis=1)
    # NOTE(review): this and the thresholds below are the 99th percentile of
    # the errors on the test rows themselves, so each detector flags roughly
    # 1% of the test set by construction.
    thresh = np.percentile(mse, 99)
    pred_temp = (mse > thresh).astype(int)

    # C) Autoencoder-style MLP: reconstruct the features through a narrow hidden layer.
    ae = MLPRegressor(hidden_layer_sizes=(len(features)//2,), activation='relu',
                      max_iter=200, random_state=SEED)
    ae.fit(train[features], train[features])
    recon = ae.predict(test[features])
    mse_ae = np.mean((test[features] - recon)**2, axis=1)
    thresh_ae = np.percentile(mse_ae, 99)
    pred_ae = (mse_ae > thresh_ae).astype(int)

    # D) LSTM on the lag window.
    print("Si esta disponible")
    # The LSTM consumes timesteps in index order, so the window must run from
    # the oldest lag to the newest. `lag_features` is ordered lag1, lag2, ...
    # (newest first), hence a separate, chronologically ordered column list.
    seq_features = [f'{v}_lag{l}' for l in sorted(lags, reverse=True) for v in TARGETS]
    X_tr = train[seq_features].values.reshape(len(train), len(lags), len(TARGETS))
    y_tr = train[TARGETS].values
    X_te = test[seq_features].values.reshape(len(test), len(lags), len(TARGETS))
    # NOTE(review): Keras weight initialisation is not seeded in this cell, so
    # the LSTM metrics can still vary between runs.
    model_lstm = Sequential([
        LSTM(50, input_shape=(len(lags), len(TARGETS))),
        Dense(3)
    ])
    model_lstm.compile(optimizer='adam', loss='mse')
    model_lstm.fit(X_tr, y_tr, epochs=10, batch_size=32, verbose=0)
    preds_lstm = model_lstm.predict(X_te)

    # Error is measured on the three target variables only.
    mse_lstm = np.mean((test[TARGETS].values - preds_lstm)**2, axis=1)
    thresh_lstm = np.percentile(mse_lstm, 99)
    pred_lstm = (mse_lstm > thresh_lstm).astype(int)

    # Precision / recall / F1 of every detector against the injected labels.
    # Insertion order fixes the column order of the results table.
    predictions = {'IF': pred_if, 'AE': pred_ae, 'Temp': pred_temp, 'LSTM': pred_lstm}
    metrics = {
        'Segmento': seg,
        'N_clients': df_seg['Cliente'].nunique(),
    }
    for tag, y_pred in predictions.items():
        metrics[f'Pre_{tag}'] = precision_score(test['anomaly'], y_pred, zero_division=0)
        metrics[f'Recall_{tag}'] = recall_score(test['anomaly'], y_pred, zero_division=0)
        metrics[f'F1_{tag}'] = f1_score(test['anomaly'], y_pred, zero_division=0)
    results.append(metrics)
   



Si esta disponible


## Mostrar resultados por segmento

In [21]:
# Collect the per-segment metric dictionaries into one table and render it.
metrics_df = pd.DataFrame(data=results)
display(metrics_df)

Unnamed: 0,Segmento,N_clients,Pre_IF,Recall_IF,F1_IF,Pre_AE,Recall_AE,F1_AE,Pre_Temp,Recall_Temp,F1_Temp,Pre_LSTM,Recall_LSTM,F1_LSTM
0,0,20,0.144918,0.335824,0.202466,0.670204,0.670468,0.670336,0.913129,0.913488,0.913308,0.912343,0.912702,0.912522
