# Notebook Final: Exportación de Modelos y Datos para la Aplicación

## 1. Importar Librerías Necesarias

In [30]:
import warnings
warnings.filterwarnings("ignore")

import os
import pickle

import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [31]:
# Root directory for every artifact exported by this notebook.
SAVE2DIR = '../saved/'

# The cells below write into three sub-folders of SAVE2DIR. Neither joblib.dump
# nor open(..., 'wb') creates missing parent directories, so create them up
# front; exist_ok=True keeps the cell safe to re-run.
for _subdir in ('encoders', 'models', 'metrics'):
    os.makedirs(os.path.join(SAVE2DIR, _subdir), exist_ok=True)

## 2. Cargar y Preprocesar Datos

In [32]:
# Read the preprocessed training set from disk.
train_df = pd.read_csv(filepath_or_buffer='../data/train_preprocessed.csv')

In [33]:
# Discard every row that contains at least one missing value.
train_df = train_df.dropna(axis=0, how='any')

In [34]:
# Encode the target column as integer class ids and persist the fitted encoder.
le = LabelEncoder()
y = le.fit_transform(train_df.loc[:, 'discourse_effectiveness'])
joblib.dump(le, f'{SAVE2DIR}encoders/label_encoder.pkl')  # save the LabelEncoder

['../saved/encoders/label_encoder.pkl']

In [35]:
# TF-IDF over unigrams and bigrams, capped at 5000 vocabulary entries;
# the fitted vectorizer is persisted next to the other encoders.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_text = tfidf.fit_transform(train_df.loc[:, 'text'])
joblib.dump(tfidf, f'{SAVE2DIR}encoders/tfidf_vectorizer.pkl')  # save the TfidfVectorizer

['../saved/encoders/tfidf_vectorizer.pkl']

In [36]:
# Standardize the text-length feature. The column is selected with a list so
# the scaler receives a 2-D, single-column frame.
scaler = StandardScaler()
X_length = scaler.fit_transform(train_df.loc[:, ['text_length']])
joblib.dump(scaler, f'{SAVE2DIR}encoders/standard_scaler.pkl')  # save the StandardScaler

['../saved/encoders/standard_scaler.pkl']

In [37]:
# One-hot encode 'discourse_type'; drop='first' leaves out the column of the
# first category. The fitted encoder is persisted with the others.
ohe = OneHotEncoder(drop='first')
X_discourse = ohe.fit_transform(train_df.loc[:, ['discourse_type']])
joblib.dump(ohe, f'{SAVE2DIR}encoders/onehot_encoder.pkl')  # save the OneHotEncoder

['../saved/encoders/onehot_encoder.pkl']

In [38]:
# Stack the TF-IDF, scaled-length and one-hot blocks column-wise into a single
# sparse design matrix. CSR is requested explicitly because hstack's default
# result (COO) does not support row indexing/slicing.
X = sp.hstack([X_text, X_length, X_discourse], format='csr')

In [39]:
# Collect the column names of X in the same order the blocks were stacked:
# TF-IDF terms, then the length feature, then the one-hot discourse columns.
tfidf_features = tfidf.get_feature_names_out()
length_feature = ['text_length']
discourse_features = ohe.get_feature_names_out(['discourse_type'])
feature_names = [*tfidf_features, *length_feature, *discourse_features]

In [40]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3. Entrenar Modelos con Mejores Hiperparámetros

In [20]:
# Train Logistic Regression and persist the fitted model.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and with the lbfgs solver the default already fits a
# multinomial model when there are more than two classes.
log_reg = LogisticRegression(
    max_iter=1000,
    C=1,
    solver='lbfgs',
    random_state=42
)
log_reg.fit(X_train, y_train)
joblib.dump(log_reg, SAVE2DIR + 'models/logistic_regression.pkl')

['../saved/models/logistic_regression.pkl']

In [25]:
# # Entrenar Random Forest
# rand_forest = RandomForestClassifier(
#     n_estimators=200,
#     max_depth=None,
#     min_samples_split=5,
#     random_state=42
# )
# rand_forest.fit(X_train, y_train)
# joblib.dump(rand_forest, SAVE2DIR + 'models/random_forest.pkl')

In [24]:
# Train XGBoost and persist the fitted model.
# `use_label_encoder` is intentionally not passed: the option is deprecated and
# current XGBoost releases ignore it with a warning. The labels handed to fit()
# are already integer-encoded by the LabelEncoder above.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    eval_metric='mlogloss',
    random_state=42
)
xgb.fit(X_train, y_train)
joblib.dump(xgb, SAVE2DIR + 'models/xgboost.pkl')

['../saved/models/xgboost.pkl']

## 4. Entrenar Modelos de Redes Neuronales

In [26]:
# Sizes shared by both neural networks: one output unit per encoded class and
# one input unit per column of the training matrix.
num_classes = le.classes_.size
input_dim = X_train.shape[-1]

### 4.1. Modelo Keras

In [28]:
def create_keras_model(input_dim, num_classes):
    """Build the (uncompiled) feed-forward Keras classifier.

    Architecture: Dense(512, relu) -> Dropout(0.5) -> Dense(256, relu)
    -> Dropout(0.5) -> Dense(num_classes, softmax).

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    # The input shape is declared with an explicit Input object rather than the
    # `input_dim=` keyword on the first Dense layer, which newer Keras versions
    # flag as deprecated for Sequential models.
    return Sequential([
        Input(shape=(input_dim,)),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

# Build the Keras network and compile it for integer-encoded labels.
keras_nn = create_keras_model(input_dim, num_classes)
keras_nn.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Fit on the training split, monitoring the validation split after every epoch.
history = keras_nn.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=128,
    epochs=50,
    callbacks=[early_stop],
    verbose=1,
)

# Persist the trained model.
keras_nn.save(f'{SAVE2DIR}models/keras_nn_model.h5')

# Persist the per-epoch training history (a plain dict of metric lists).
with open(f'{SAVE2DIR}metrics/keras_history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

Epoch 1/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - accuracy: 0.6068 - loss: 0.8889 - val_accuracy: 0.6658 - val_loss: 0.7517
Epoch 2/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.6959 - loss: 0.6817 - val_accuracy: 0.6688 - val_loss: 0.7577
Epoch 3/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.7390 - loss: 0.5900 - val_accuracy: 0.6479 - val_loss: 0.7841
Epoch 4/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.7981 - loss: 0.4910 - val_accuracy: 0.6451 - val_loss: 0.8509
Epoch 5/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.8605 - loss: 0.3730 - val_accuracy: 0.6291 - val_loss: 0.9758
Epoch 6/50
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.9019 - loss: 0.2728 - val_accuracy: 0.6254 - val_loss: 1.1249




### 4.2. Modelo PyTorch

In [41]:
class NeuralNetPyTorch(nn.Module):
    """Three-layer feed-forward classifier that returns raw class logits.

    Layout: Linear(input_size, 512) -> ReLU -> Dropout(0.5)
    -> Linear(512, 256) -> ReLU -> Dropout(0.5) -> Linear(256, num_classes).
    No softmax is applied in ``forward``.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Attribute names and registration order are kept stable because they
        # define the keys of the saved state_dict.
        self.fc1 = nn.Linear(in_features=input_size, out_features=512)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=512, out_features=256)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(in_features=256, out_features=num_classes)

    def forward(self, x):
        """Map a batch of feature rows to a batch of per-class logits."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Create the PyTorch model, then move it to the GPU when one is available.
pytorch_nn = NeuralNetPyTorch(input_dim, num_classes)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
pytorch_nn.to(device)

# Cross-entropy on raw logits, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=pytorch_nn.parameters(), lr=1e-3)

# Prepare the data for PyTorch
class TextDataset(Dataset):
    """In-memory dataset pairing float32 feature rows with int64 labels.

    Args:
        X: 2-D feature matrix. Either a sparse matrix (any object exposing
            ``toarray()``) or a dense array-like.
        y: 1-D array-like of class ids, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Sparse inputs are densified once here, so the whole matrix is held in
        # memory as float32; dense inputs are accepted as-is.
        dense = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)
        self.X = dense.astype('float32')
        self.y = np.asarray(y).astype('int64')
        # Fail fast instead of raising an IndexError deep inside a DataLoader.
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f'X has {self.X.shape[0]} rows but y has {self.y.shape[0]} labels'
            )

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap both splits as datasets.
train_dataset_pytorch = TextDataset(X_train, y_train)
val_dataset_pytorch = TextDataset(X_val, y_val)

# Mini-batches of 128; only the training loader reshuffles between epochs.
train_loader_pytorch = DataLoader(dataset=train_dataset_pytorch, batch_size=128, shuffle=True)
val_loader_pytorch = DataLoader(dataset=val_dataset_pytorch, batch_size=128, shuffle=False)

In [43]:
# Select the compute device (GPU when available) and move the model onto it.
# The final expression is left bare so the cell displays the module summary.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
pytorch_nn.to(device)

NeuralNetPyTorch(
  (fc1): Linear(in_features=5007, out_features=512, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=256, out_features=3, bias=True)
)

In [48]:
# Train the PyTorch model with manual early stopping on the validation loss.
# NOTE(review): the output recorded for this cell starts at a train loss of
# 0.2074, which suggests it was last run on weights that had already been
# trained; re-create the model and optimizer before re-running — confirm.
num_epochs = 50
patience = 5
best_val_loss = float('inf')
early_stop_counter = 0

# Per-epoch history.
train_losses_pytorch, val_losses_pytorch, val_accuracies_pytorch = [], [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    pytorch_nn.train()
    running_loss = 0.0
    for inputs, labels in train_loader_pytorch:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        loss = criterion(pytorch_nn(inputs), labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch figure is
        # a per-sample average even when the last batch is smaller.
        running_loss += loss.item() * len(inputs)

    epoch_loss = running_loss / len(train_loader_pytorch.dataset)
    train_losses_pytorch.append(epoch_loss)

    # ---- validation pass (no gradients) ----
    pytorch_nn.eval()
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader_pytorch:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = pytorch_nn(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * len(inputs)

            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += len(labels)
            correct += int((predicted == labels).sum())

    val_loss = val_loss / len(val_loader_pytorch.dataset)
    val_losses_pytorch.append(val_loss)
    val_accuracy = correct / total
    val_accuracies_pytorch.append(val_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {val_loss:.4f} - Val Acc: {val_accuracy:.4f}')

    # ---- early stopping ----
    if val_loss < best_val_loss:
        # New best: reset the patience counter and checkpoint the weights.
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(pytorch_nn.state_dict(), f'{SAVE2DIR}models/best_pytorch_nn_model.pth')
        continue

    early_stop_counter += 1
    if early_stop_counter >= patience:
        print("Early stopping")
        break

Epoch 1/50 - Train Loss: 0.2074 - Val Loss: 1.3095 - Val Acc: 0.6099
Epoch 2/50 - Train Loss: 0.1564 - Val Loss: 1.4840 - Val Acc: 0.6206
Epoch 3/50 - Train Loss: 0.1296 - Val Loss: 1.5899 - Val Acc: 0.6126
Epoch 4/50 - Train Loss: 0.1086 - Val Loss: 1.6893 - Val Acc: 0.6208
Epoch 5/50 - Train Loss: 0.0922 - Val Loss: 1.8065 - Val Acc: 0.6147
Epoch 6/50 - Train Loss: 0.0811 - Val Loss: 1.8261 - Val Acc: 0.6117
Early stopping


In [50]:
# Load the best PyTorch checkpoint back into the model.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
pytorch_nn.load_state_dict(
    torch.load(SAVE2DIR + 'models/best_pytorch_nn_model.pth', map_location=device)
)
pytorch_nn.eval()

NeuralNetPyTorch(
  (fc1): Linear(in_features=5007, out_features=512, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=256, out_features=3, bias=True)
)

In [51]:
# Bundle the PyTorch training curves into one dict and pickle it.
metrics_pytorch = dict(
    train_losses=train_losses_pytorch,
    val_losses=val_losses_pytorch,
    val_accuracies=val_accuracies_pytorch,
)

with open(f'{SAVE2DIR}metrics/pytorch_metrics.pkl', 'wb') as f:
    pickle.dump(metrics_pytorch, f)

## 5. Evaluación de los Modelos