# Notebook Final: Exportación de Modelos y Datos para la Aplicación

## 1. Importar Librerías Necesarias

In [15]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import joblib
import pickle
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import scipy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import shap
from sklearn.inspection import permutation_importance

# Para visualizar los resultados de SHAP en el notebook (opcional)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
# Save to DIR
SAVE2DIR = '../saved/'

## 2. Cargar y Preprocesar Datos

In [17]:
# Cargar los datos preprocesados
train_df = pd.read_csv('../data/train_preprocessed.csv')

In [18]:
# Verificar y manejar valores nulos
train_df = train_df.dropna()

train_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,text_length,text
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,317,hi im isaac im going writing face mars natural...
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,210,perspective think face natural landform dont t...
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,105,think face natural landform life mars descover...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,362,life mars would know reason think natural land...
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,101,people thought face formed alieans thought lif...


In [19]:
# Codificar las etiquetas de clase
le = LabelEncoder()
y = le.fit_transform(train_df['discourse_effectiveness'])
joblib.dump(le, SAVE2DIR + 'encoders/label_encoder.pkl')  # Guardar el LabelEncoder

['../saved/encoders/label_encoder.pkl']

In [20]:
# Vectorización del texto
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_text = tfidf.fit_transform(train_df['text'])
joblib.dump(tfidf, SAVE2DIR + 'encoders/tfidf_vectorizer.pkl')  # Guardar el TfidfVectorizer

['../saved/encoders/tfidf_vectorizer.pkl']

In [21]:
# Escalar la característica de longitud del texto
scaler = StandardScaler()
X_length = scaler.fit_transform(train_df[['text_length']])
joblib.dump(scaler, SAVE2DIR + 'encoders/standard_scaler.pkl')  # Guardar el StandardScaler

['../saved/encoders/standard_scaler.pkl']

In [22]:
# Codificar 'discourse_type' con OneHotEncoder
ohe = OneHotEncoder(drop='first')
X_discourse = ohe.fit_transform(train_df[['discourse_type']])
joblib.dump(ohe, SAVE2DIR + 'encoders/onehot_encoder.pkl')  # Guardar el OneHotEncoder

['../saved/encoders/onehot_encoder.pkl']

In [80]:
train_df[['discourse_type']]

Unnamed: 0,discourse_type
0,Lead
1,Position
2,Claim
3,Evidence
4,Counterclaim
...,...
36760,Claim
36761,Claim
36762,Position
36763,Evidence


In [23]:
# Concatenar características
import scipy.sparse as sp
X = sp.hstack([X_text, X_length, X_discourse])

In [24]:
# Obtener nombres de las características para futuras referencias
tfidf_features = tfidf.get_feature_names_out()
length_feature = ['text_length']
discourse_features = ohe.get_feature_names_out(['discourse_type'])
feature_names = list(tfidf_features) + length_feature + list(discourse_features)

In [79]:
ohe.get_feature_names_out(['discourse_type'])

array(['discourse_type_Concluding Statement',
       'discourse_type_Counterclaim', 'discourse_type_Evidence',
       'discourse_type_Lead', 'discourse_type_Position',
       'discourse_type_Rebuttal'], dtype=object)

In [25]:
# Dividir los datos en entrenamiento y validación
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## 3. Entrenar Modelos con Mejores Hiperparámetros

In [26]:
# Entrenar Logistic Regression
log_reg = LogisticRegression(
    multi_class='multinomial',
    max_iter=1000,
    C=1,
    solver='lbfgs',
    random_state=42
)
log_reg.fit(X_train, y_train)
joblib.dump(log_reg, SAVE2DIR + 'models/logistic_regression.pkl')

['../saved/models/logistic_regression.pkl']

In [27]:
# Entrenar XGBoost
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
xgb.fit(X_train, y_train)
joblib.dump(xgb, SAVE2DIR + 'models/xgboost.pkl')

['../saved/models/xgboost.pkl']

## 4. Entrenar Modelos de Redes Neuronales

In [28]:
# Definir parámetros comunes
num_classes = len(le.classes_)
input_dim = X_train.shape[1]

### 4.1. Modelo Keras

In [29]:
def create_keras_model(input_dim, num_classes):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    return model

# Crear y compilar el modelo Keras
keras_nn = create_keras_model(input_dim, num_classes)
keras_nn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Definir Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Entrenar el modelo Keras
history = keras_nn.fit(
    X_train, y_train,
    epochs=50,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
    verbose=1
)

# Guardar el modelo Keras
keras_nn.save(SAVE2DIR + 'models/keras_nn_model.h5')

# Guardar el historial de entrenamiento
with open(SAVE2DIR + 'metrics/keras_history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

Epoch 1/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - accuracy: 0.6079 - loss: 0.8853 - val_accuracy: 0.6692 - val_loss: 0.7487
Epoch 2/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.6969 - loss: 0.6878 - val_accuracy: 0.6600 - val_loss: 0.7554
Epoch 3/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.7400 - loss: 0.5986 - val_accuracy: 0.6443 - val_loss: 0.7850
Epoch 4/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.8005 - loss: 0.4911 - val_accuracy: 0.6362 - val_loss: 0.8501
Epoch 5/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.8543 - loss: 0.3828 - val_accuracy: 0.6386 - val_loss: 0.9625
Epoch 6/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.9009 - loss: 0.2772 - val_accuracy: 0.6268 - val_loss: 1.1478




### 4.2. Modelo PyTorch

In [30]:
class NeuralNetPyTorch(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NeuralNetPyTorch, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# Inicializar el modelo PyTorch
pytorch_nn = NeuralNetPyTorch(input_dim, num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pytorch_nn.to(device)

# Definir la función de pérdida y el optimizador
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(pytorch_nn.parameters(), lr=0.001)

# Preparar datos para PyTorch
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X.toarray().astype('float32')
        self.y = y.astype('int64')

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset_pytorch = TextDataset(X_train, y_train)
val_dataset_pytorch = TextDataset(X_val, y_val)

train_loader_pytorch = DataLoader(train_dataset_pytorch, batch_size=128, shuffle=True)
val_loader_pytorch = DataLoader(val_dataset_pytorch, batch_size=128, shuffle=False)

In [31]:
# Definir el dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pytorch_nn.to(device)

NeuralNetPyTorch(
  (fc1): Linear(in_features=5007, out_features=512, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=256, out_features=3, bias=True)
)

In [32]:
# Entrenar el modelo PyTorch
num_epochs = 50
patience = 5
best_val_loss = float('inf')
early_stop_counter = 0

train_losses_pytorch = []
val_losses_pytorch = []
val_accuracies_pytorch = []

for epoch in range(num_epochs):
    pytorch_nn.train()
    running_loss = 0.0
    for inputs, labels in train_loader_pytorch:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = pytorch_nn(inputs)
        loss = criterion(outputs, labels)

        # Backward pass y optimización
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader_pytorch.dataset)
    train_losses_pytorch.append(epoch_loss)

    # Validación
    pytorch_nn.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader_pytorch:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = pytorch_nn(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader_pytorch.dataset)
    val_losses_pytorch.append(val_loss)
    val_accuracy = correct / total
    val_accuracies_pytorch.append(val_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {val_loss:.4f} - Val Acc: {val_accuracy:.4f}')

    # Early Stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        # Guardar el mejor modelo
        torch.save(pytorch_nn.state_dict(), SAVE2DIR + 'models/best_pytorch_nn_model.pth')
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping")
            break

Epoch 1/50 - Train Loss: 0.8236 - Val Loss: 0.7551 - Val Acc: 0.6616
Epoch 2/50 - Train Loss: 0.6970 - Val Loss: 0.7621 - Val Acc: 0.6685
Epoch 3/50 - Train Loss: 0.6317 - Val Loss: 0.7786 - Val Acc: 0.6511
Epoch 4/50 - Train Loss: 0.5615 - Val Loss: 0.8366 - Val Acc: 0.6513
Epoch 5/50 - Train Loss: 0.4772 - Val Loss: 0.8888 - Val Acc: 0.6322
Epoch 6/50 - Train Loss: 0.3836 - Val Loss: 0.9940 - Val Acc: 0.6235
Early stopping


In [33]:
# Cargar el mejor modelo PyTorch
pytorch_nn.load_state_dict(torch.load(SAVE2DIR + 'models/best_pytorch_nn_model.pth'))
pytorch_nn.eval()

NeuralNetPyTorch(
  (fc1): Linear(in_features=5007, out_features=512, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=256, out_features=3, bias=True)
)

In [34]:
# Guardar las métricas de PyTorch
metrics_pytorch = {
    'train_losses': train_losses_pytorch,
    'val_losses': val_losses_pytorch,
    'val_accuracies': val_accuracies_pytorch
}

with open(SAVE2DIR + 'metrics/pytorch_metrics.pkl', 'wb') as f:
    pickle.dump(metrics_pytorch, f)

## 5. Evaluación de los Modelos

In [78]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import scipy.sparse

models_performance = {}

# Function to evaluate models
def evaluate_model(name, model, X_val, y_val, is_nn=False, device='cpu'):
    if is_nn:
        if name == 'PyTorch NN':
            # Check if X_val is a sparse matrix
            if scipy.sparse.issparse(X_val):
                # Convert sparse matrix to dense
                X_val_dense = X_val.toarray()
            else:
                X_val_dense = X_val
            # Convert to PyTorch tensor
            X_val_tensor = torch.tensor(X_val_dense, dtype=torch.float32).to(device)
            with torch.no_grad():
                outputs = model(X_val_tensor)
                _, preds = torch.max(outputs, 1)
                y_pred = preds.cpu().numpy()
        else:  # Keras NN
            # Keras can handle sparse matrices if they're converted to dense
            if scipy.sparse.issparse(X_val):
                X_val_dense = X_val.toarray()
            else:
                X_val_dense = X_val
            y_pred = np.argmax(model.predict(X_val_dense), axis=1)
    else:
        y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, target_names=le.classes_, output_dict=True)
    cm = confusion_matrix(y_val, y_pred)

    performance = {
        'accuracy': acc,
        'classification_report': report,
        'confusion_matrix': cm
    }

    models_performance[name] = performance

    # Add feature importances if applicable
    if name in ['Random Forest', 'XGBoost']:
        feature_importances = [{'feature': feature, 'importance': importance}
                               for feature, importance in zip(feature_names, model.feature_importances_)]
        # Save to a csv file
        pd.DataFrame(feature_importances).to_csv(SAVE2DIR + f'metrics/{name}_feature_importances.csv', index=False)
        # performance['feature_importances'] = feature_importances

    elif name == 'Logistic Regression':
        # Logistic Regression feature importances are based on coefficients
        feature_importances = [{'feature': feature, 'importance': coef}
                               for feature, coef in zip(feature_names, model.coef_[0])]
        # Save to a csv file
        pd.DataFrame(feature_importances).to_csv(SAVE2DIR + 'metrics/logistic_regression_feature_importances.csv', index=False)

# Example evaluations
# Evaluate Logistic Regression
evaluate_model('Logistic Regression', log_reg, X_val, y_val)

# Evaluate XGBoost
evaluate_model('XGBoost', xgb, X_val, y_val)

# Evaluate Keras NN
evaluate_model('Keras NN', keras_nn, X_val, y_val, is_nn=True)

# Evaluate PyTorch NN
evaluate_model('PyTorch NN', pytorch_nn, X_val, y_val, is_nn=True, device=device)

# Save model performance metrics
with open(SAVE2DIR + 'metrics/models_performance.pkl', 'wb') as f:
    pickle.dump(models_performance, f)


[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


## 7. Extraer y Guardar Importancias de Características

### 7.1. Importancias de XGBoost

In [36]:
# Guardar importancias de XGBoost
if 'XGBoost' in models_performance:
    xgb_importances = models_performance['XGBoost']['feature_importances']
    xgb_importances_df = pd.DataFrame({
        'feature': feature_names,
        'importance': xgb_importances
    }).sort_values(by='importance', ascending=False)
    xgb_importances_df.to_csv(SAVE2DIR + 'metrics/xgboost_feature_importances.csv', index=False)

### 7.2 Importancia de las características SHAP

#### 7.2.1 Importancia de las características para Keras NN

In [76]:
print("Calculando SHAP para el modelo Keras NN...")

# Convertir X_val a denso si es necesario
if scipy.sparse.issparse(X_val):
    X_val_dense = X_val.toarray()
else:
    X_val_dense = X_val

# Seleccionar una muestra para acelerar el cálculo de SHAP (opcional)
shap_sample_size = 10  # Puedes ajustar este número según tus recursos
X_shap = X_val_dense[:shap_sample_size]

# Crear el explainer usando DeepExplainer
explainer_keras = shap.DeepExplainer(keras_nn, X_shap)

# Calcular los valores SHAP
shap_values_keras = explainer_keras.shap_values(X_shap)

# Agregar los valores SHAP para todas las clases
# Para clasificación multiclase, SHAP devuelve una lista de arrays, uno por clase
if isinstance(shap_values_keras, list):
    # Cada elemento de la lista corresponde a una clase
    print(f"GG Number of classes: {len(shap_values_keras)}")
    shap_importances_keras = np.array([
        np.abs(shap_val).mean(axis=0) for shap_val in shap_values_keras
    ])  # Shape: (num_classes, num_features)
else:
    shap_importances_keras = np.abs(shap_values_keras).mean(axis=0)  # Shape: (num_features,)

# Verificar las longitudes
print(f"Length of feature_names: {len(feature_names)}")
print(f"Shape of shap_importances_keras: {shap_importances_keras.shape}")

# Alinear las longitudes si es necesario
if len(feature_names) != shap_importances_keras.shape[-1]:
    print("Mismatch detected between feature names and SHAP importances for Keras NN.")
    # Investigar la causa del desajuste
    # Por ahora, truncamos ambas listas al tamaño mínimo
    min_length = min(len(feature_names), shap_importances_keras.shape[-1])
    feature_names_aligned = feature_names
    shap_importances_keras = shap_importances_keras[:, :min_length] if isinstance(shap_importances_keras, np.ndarray) else shap_importances_keras[:min_length]
else:
    feature_names_aligned = feature_names

# Verificar nuevamente las dimensiones después de la alineación
print(f"Length of feature_names_aligned: {len(feature_names_aligned)}")
if isinstance(shap_importances_keras, np.ndarray):
    print(f"Shape of shap_importances_keras after alignment: {shap_importances_keras.shape}")
else:
    print(f"Length of shap_importances_keras after alignment: {len(shap_importances_keras)}")

print("Guardando importancias de características SHAP para Keras NN...")

# Crear un DataFrame con las importancias por clase
shap_importances_df_keras = pd.DataFrame({
    'feature': feature_names_aligned
})

print(shap_importances_keras[1000])

# Añadir una columna por cada clase
for class_idx, class_name in enumerate(le.classes_):
    print(f"Class: {class_name}")
    print(f"Class index: {class_idx}")

    # for each element in shap_importances_keras, get the class_idx element
    class_importance = []
    for i in range(len(shap_importances_keras)):
        class_importance.append(shap_importances_keras[i][class_idx])

    shap_importances_df_keras[f'shap_importance_class_{class_name}'] = class_importance

# Ordenar el DataFrame según la importancia de la primera clase (puedes elegir otra clase o usar un promedio)
shap_importances_df_keras = shap_importances_df_keras.sort_values(by=f'shap_importance_class_{le.classes_[0]}', ascending=False)

# Guardar las importancias de SHAP para Keras
shap_importances_df_keras.to_csv(SAVE2DIR + 'metrics/keras_nn_shap_importances.csv', index=False)

print("Importancias de características SHAP para Keras NN guardadas en 'metrics/keras_nn_shap_importances.csv'.")

Calculando SHAP para el modelo Keras NN...
Length of feature_names: 5007
Shape of shap_importances_keras: (5007, 3)
Mismatch detected between feature names and SHAP importances for Keras NN.
Length of feature_names_aligned: 5007
Shape of shap_importances_keras after alignment: (5007, 3)
Guardando importancias de características SHAP para Keras NN...
[0. 0. 0.]
Class: Adequate
Class index: 0
Class: Effective
Class index: 1
Class: Ineffective
Class index: 2
Importancias de características SHAP para Keras NN guardadas en 'metrics/keras_nn_shap_importances.csv'.


#### 7.2.2 Importancia de las características para PyTorch NN

In [77]:
print("Calculando SHAP para el modelo PyTorch NN...")

# Definir una función de predicción para PyTorch
def pytorch_predict(x):
    pytorch_nn.eval()
    with torch.no_grad():
        inputs = torch.tensor(x, dtype=torch.float32).to(device)
        outputs = pytorch_nn(inputs)
        probs = torch.softmax(outputs, dim=1).cpu().numpy()
    return probs

# Crear el explainer usando KernelExplainer
explainer_pytorch = shap.KernelExplainer(pytorch_predict, X_shap)

# Calcular los valores SHAP
shap_values_pytorch = explainer_pytorch.shap_values(X_shap, nsamples=100)

# Agregar los valores SHAP para todas las clases
if isinstance(shap_values_pytorch, list):
    shap_importances_pytorch = np.array([
        np.abs(shap_val).mean(axis=0) for shap_val in shap_values_pytorch
    ])  # Shape: (num_classes, num_features)
else:
    shap_importances_pytorch = np.abs(shap_values_pytorch).mean(axis=0)  # Shape: (num_features,)

# Verificar las longitudes
print(f"Length of feature_names: {len(feature_names)}")
print(f"Shape of shap_importances_pytorch: {shap_importances_pytorch.shape}")

# Alinear las longitudes si es necesario
if len(feature_names) != shap_importances_pytorch.shape[-1]:
    print("Mismatch detected between feature names and SHAP importances for PyTorch NN.")
    # Investigar la causa del desajuste
    # Por ahora, truncamos ambas listas al tamaño mínimo
    min_length = min(len(feature_names), shap_importances_pytorch.shape[-1])
    feature_names_aligned_pytorch = feature_names
    shap_importances_pytorch = shap_importances_pytorch[:, :min_length] if isinstance(shap_importances_pytorch, np.ndarray) else shap_importances_pytorch[:min_length]
else:
    feature_names_aligned_pytorch = feature_names

# Verificar nuevamente las dimensiones después de la alineación
print(f"Length of feature_names_aligned_pytorch: {len(feature_names_aligned_pytorch)}")
if isinstance(shap_importances_pytorch, np.ndarray):
    print(f"Shape of shap_importances_pytorch after alignment: {shap_importances_pytorch.shape}")
else:
    print(f"Length of shap_importances_pytorch after alignment: {len(shap_importances_pytorch)}")

print("Guardando importancias de características SHAP para PyTorch NN...")

# Crear un DataFrame con las importancias por clase
shap_importances_df_pytorch = pd.DataFrame({
    'feature': feature_names_aligned_pytorch
})

# Añadir una columna por cada clase
for class_idx, class_name in enumerate(le.classes_):
    class_importance = []
    for i in range(len(shap_importances_pytorch)):
        class_importance.append(shap_importances_pytorch[i][class_idx])

    shap_importances_df_pytorch[f'shap_importance_class_{class_name}'] = class_importance

# Ordenar el DataFrame según la importancia de la primera clase (puedes elegir otra clase o usar un promedio)
shap_importances_df_pytorch = shap_importances_df_pytorch.sort_values(by=f'shap_importance_class_{le.classes_[0]}', ascending=False)

# Guardar las importancias de SHAP para PyTorch
shap_importances_df_pytorch.to_csv(SAVE2DIR + 'metrics/pytorch_nn_shap_importances.csv', index=False)

print("Importancias de características SHAP para PyTorch NN guardadas en 'metrics/pytorch_nn_shap_importances.csv'.")

Calculando SHAP para el modelo PyTorch NN...


  0%|          | 0/10 [00:00<?, ?it/s]

Length of feature_names: 5007
Shape of shap_importances_pytorch: (5007, 3)
Mismatch detected between feature names and SHAP importances for PyTorch NN.
Length of feature_names_aligned_pytorch: 5007
Shape of shap_importances_pytorch after alignment: (5007, 3)
Guardando importancias de características SHAP para PyTorch NN...
Importancias de características SHAP para PyTorch NN guardadas en 'metrics/pytorch_nn_shap_importances.csv'.


## 8. Guardar Métricas Adicionales para Visualizaciones

In [39]:
# 9.1. Guardar las matrices de confusión como datos
for model_name, metrics in models_performance.items():
    cm = metrics['confusion_matrix']
    cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
    cm_df.to_csv(SAVE2DIR + f'metrics/confusion_matrix_{model_name.replace(" ", "_").lower()}.csv')

# 9.2. Guardar las precisiones de los modelos
accuracy_df = pd.DataFrame({
    'Model': list(models_performance.keys()),
    'Accuracy': [metrics['accuracy'] for metrics in models_performance.values()]
}).sort_values(by='Accuracy', ascending=False)

accuracy_df.to_csv(SAVE2DIR +'metrics/model_accuracies.csv', index=False)

# 9.3. Guardar informes de clasificación
for model_name, metrics in models_performance.items():
    report = metrics['classification_report']
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(SAVE2DIR + f'metrics/classification_report_{model_name.replace(" ", "_").lower()}.csv')

# 10. Guardar Información Adicional del Dataset para Gráficas

# 10.1. Estadísticas de Longitud de Texto
text_length_stats = train_df['text_length'].describe().to_frame().T
text_length_stats.to_csv(SAVE2DIR + 'metrics/text_length_stats.csv', index=False)

# 10.2. Distribución de Tipos de Discurso
discourse_type_counts = train_df['discourse_type'].value_counts().reset_index()
discourse_type_counts.columns = ['discourse_type', 'count']
discourse_type_counts.to_csv(SAVE2DIR + 'metrics/discourse_type_counts.csv', index=False)

# 10.3. Distribución de Efectividad del Discurso
effectiveness_counts = train_df['discourse_effectiveness'].value_counts().reset_index()
effectiveness_counts.columns = ['discourse_effectiveness', 'count']
effectiveness_counts.to_csv(SAVE2DIR + 'metrics/discourse_effectiveness_counts.csv', index=False)

# 10.4. Guardar el Dataset Procesado (Opcional)
train_df.to_csv(SAVE2DIR + 'metrics/processed_train_dataset.csv', index=False)

print("¡Todos los modelos, codificadores y métricas han sido guardados correctamente!")

¡Todos los modelos, codificadores y métricas han sido guardados correctamente!
