# Baseline Model: Logistic Regression

In [77]:

import os

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [78]:
# Paths below are relative, so step up one directory when the kernel's
# working directory is not named after the repository.
# NOTE(review): presumably the notebook lives exactly one level below the
# repository root - confirm, since the parent directory is not checked.
if os.path.basename(os.getcwd()) != 'Seismic-Multilabel-Event-Classifier':
    os.chdir('..')
    print(f"Changed directory to {os.getcwd()}")

In [79]:

# Load the processed dataset from JSON, report its shape and preview the
# first rows (the bare last expression is rendered as the cell output).
data_path = 'data/processed/dataset_final.json'
df = pd.read_json(data_path)
print(f'Dataset shape: {df.shape}')
df.head()


Dataset shape: (1674, 91)


Unnamed: 0,max_V,rms_V,max_H1,rms_H1,max_H2,rms_H2,duration,zcr_V,dom_freq_V,centroid_V,...,FFT_PCA_62,1 Stiker Slip (SS),2 Normal-Oblique (SO),3 Reverse-Oblique (RO),4-6,6-8,0-200,200-400,400-600,600-
0,0.000342,5.2e-05,0.000587,0.000111,0.000617,0.000107,321.68,2.014362,0.0,5.447156,...,0.015319,True,False,False,True,False,False,False,False,True
1,0.003967,0.000347,0.006399,0.000501,0.004988,0.000475,200.0,9.034548,0.78125,17.55107,...,0.015314,True,False,False,True,False,False,False,False,True
2,0.000135,2.3e-05,0.000204,2.8e-05,0.000157,2.6e-05,323.79,1.429895,0.195312,2.573894,...,0.015312,True,False,False,True,False,False,False,False,True
3,0.003166,0.000453,0.00503,0.000657,0.004669,0.000725,41.0,21.189954,14.0625,14.019404,...,0.001835,True,False,False,True,False,False,False,False,True
4,1.1e-05,2e-06,2.1e-05,3e-06,1.8e-05,3e-06,180.8,3.943366,0.195312,10.936094,...,0.015316,True,False,False,True,False,False,False,False,True


In [80]:
# One-hot label columns that form the multilabel target.
# The names are spelled exactly as they are written here, because they are
# used verbatim as DataFrame column keys.
label_cols = [
    '1 Stiker Slip (SS)',
    '2 Normal-Oblique (SO)',
    '3 Reverse-Oblique (RO)',
    '4-6',
    '6-8',
    '0-200',
    '200-400',
    '400-600',
    '600-',
]

# Columns that must not end up in X: the non-feature columns listed below
# plus every label column.
meta_cols = ['Archivo', 'Falla_lbl', 'Mag_lbl', 'Vs_lbl', 'Combo']
drop_cols = [*meta_cols, *label_cols]

# Features are every DataFrame column that is not listed in drop_cols,
# kept in their original column order.
# NOTE(review): no dtype check is done here - this assumes all remaining
# columns are numeric; confirm against the dataset.
excluded = set(drop_cols)
feature_cols = [col for col in df.columns if col not in excluded]

# Plain NumPy arrays for scikit-learn: X holds the features, y the one-hot labels.
X = df[feature_cols].to_numpy()
y = df[label_cols].to_numpy()

print(f'Features shape: {X.shape}')
print(f'Labels shape: {y.shape}')
print(f'Label columns: {label_cols}')

Features shape: (1674, 82)
Labels shape: (1674, 9)
Label columns: ['1 Stiker Slip (SS)', '2 Normal-Oblique (SO)', '3 Reverse-Oblique (RO)', '4-6', '6-8', '0-200', '200-400', '400-600', '600-']


In [81]:

# Train (60%) / validation (20%) / test (20%) split.
# Step 1: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: take 25% of the remaining 80% as validation (20% of the total),
# which leaves 60% of the total for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
    stratify=y_train_val,
)

print(f'Train: {X_train.shape} {y_train.shape}')
print(f'Validation: {X_val.shape} {y_val.shape}')
print(f'Test: {X_test.shape} {y_test.shape}')


Train: (1004, 82) (1004, 9)
Validation: (335, 82) (335, 9)
Test: (335, 82) (335, 9)


In [82]:

# Multilabel baseline: standardise the features, then fit one logistic
# regression per label column through a one-vs-rest wrapper.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', OneVsRestClassifier(estimator=logreg)),
])


In [83]:

# Hyperparameter grid for the search.
# Keys follow the `<pipeline step>__estimator__<parameter>` naming so they
# reach the LogisticRegression wrapped inside the 'clf' step.
# Only C takes several values; penalty and solver are each pinned to one.
param_grid = dict(
    clf__estimator__C=[0.01, 0.1, 1, 10],
    clf__estimator__penalty=['l2'],
    clf__estimator__solver=['lbfgs'],
)

print('Param grid:', param_grid)


Param grid: {'clf__estimator__C': [0.01, 0.1, 1, 10], 'clf__estimator__penalty': ['l2'], 'clf__estimator__solver': ['lbfgs']}


In [84]:

# Hyperparameter search on the training split only:
# 5 shuffled folds with a fixed seed, candidates ranked by micro-averaged F1.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(f1_score, average='micro')

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,  # use every available core
    verbose=2,  # log each individual fit
)

# Run the search and report the winning configuration.
grid.fit(X_train, y_train)
print(f'Best params: {grid.best_params_}')
print('Best CV score (micro f1):', grid.best_score_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END clf__estimator__C=0.01, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=0.01, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=0.01, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=0.01, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=0.01, clf__estimator__penalty=l2, clf__estimator__solver=lbfgs; total time=   0.0s
[CV] END clf__estimator__C=1, clf__estimator__penalty=l2, clf__

In [85]:

# Score the grid-search winner on the validation split.
best_model = grid.best_estimator_
y_val_pred = best_model.predict(X_val)

val_f1_micro = f1_score(y_val, y_val_pred, average='micro')
print('Validation F1 Micro:', val_f1_micro)
val_f1_macro = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 Macro:', val_f1_macro)

# Per-label precision / recall / F1; zero_division=0 reports 0 instead of
# warning when a metric is undefined for a label.
print('\nClassification Report on Validation:\n')
val_report = classification_report(
    y_val,
    y_val_pred,
    target_names=label_cols,
    zero_division=0,
)
print(val_report)


Validation F1 Micro: 0.5331820760068066
Validation F1 Macro: 0.4579893932981597

Classification Report on Validation:

                        precision    recall  f1-score   support

    1 Stiker Slip (SS)       0.44      0.23      0.30       119
 2 Normal-Oblique (SO)       0.41      0.22      0.29        77
3 Reverse-Oblique (RO)       0.54      0.46      0.50       139
                   4-6       0.80      0.76      0.78       188
                   6-8       0.71      0.76      0.73       147
                 0-200       0.44      0.18      0.26        38
               200-400       0.55      0.30      0.38        98
               400-600       0.46      0.28      0.35       107
                  600-       0.64      0.47      0.54        92

             micro avg       0.62      0.47      0.53      1005
             macro avg       0.55      0.41      0.46      1005
          weighted avg       0.59      0.47      0.51      1005
           samples avg       0.66      0.47    

In [86]:

# Retrain the selected configuration on train+validation and evaluate it
# once on the held-out test split.
#
# `best_model` may be the very object stored in `grid.best_estimator_`.
# Calling .fit() on it directly would overwrite the grid's train-only model
# in place, so re-running the validation cell afterwards would score a model
# that has already seen the validation rows. clone() returns an unfitted
# copy with identical hyperparameters; fitting that copy leaves the grid
# search result untouched.
X_train_full = np.vstack([X_train, X_val])
y_train_full = np.vstack([y_train, y_val])

# Rebind `best_model` so later cells keep using the train+val model.
best_model = clone(best_model).fit(X_train_full, y_train_full)
y_test_pred = best_model.predict(X_test)

print('Test F1 Micro:', f1_score(y_test, y_test_pred, average='micro'))
print('Test F1 Macro:', f1_score(y_test, y_test_pred, average='macro'))
print('\nClassification Report on Test:\n')
print(classification_report(y_test, y_test_pred, target_names=label_cols, zero_division=0))


Test F1 Micro: 0.5499412455934195
Test F1 Macro: 0.45148476037016305

Classification Report on Test:

                        precision    recall  f1-score   support

    1 Stiker Slip (SS)       0.61      0.18      0.28       119
 2 Normal-Oblique (SO)       0.50      0.18      0.27        77
3 Reverse-Oblique (RO)       0.61      0.45      0.52       139
                   4-6       0.80      0.82      0.81       188
                   6-8       0.77      0.73      0.75       147
                 0-200       0.44      0.11      0.17        38
               200-400       0.47      0.29      0.36        97
               400-600       0.52      0.25      0.34       107
                  600-       0.64      0.51      0.57        93

             micro avg       0.67      0.47      0.55      1005
             macro avg       0.60      0.39      0.45      1005
          weighted avg       0.63      0.47      0.52      1005
           samples avg       0.69      0.47      0.53      1005


In [87]:

# Persist the current `best_model` to disk with joblib.
import joblib

model_dir = 'models'
model_path = 'models/logreg_best_hyperparams.pkl'

os.makedirs(model_dir, exist_ok=True)  # no error if the folder already exists
joblib.dump(best_model, model_path)
print(f'Modelo guardado en {model_path}')


Modelo guardado en models/logreg_best_hyperparams.pkl
