# Notebook Final: Exportación de Modelos y Datos para la Aplicación

## 1. Importar Librerías Necesarias

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import pickle

import pandas as pd
import numpy as np
import joblib
import scipy.sparse as sp

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Root directory under which every exported artifact is written
SAVE2DIR = '../saved/'

# joblib.dump opens its target path directly and does not create missing
# parent folders, so make sure the sub-directories used by the later cells
# exist before anything is saved (no-op when they are already there).
for _subdir in ('encoders', 'models'):
    os.makedirs(os.path.join(SAVE2DIR, _subdir), exist_ok=True)

## 2. Cargar y Preprocesar Datos

In [3]:
# Read the preprocessed training set from disk
train_csv_path = '../data/train_preprocessed.csv'
train_df = pd.read_csv(train_csv_path)

In [4]:
# Discard every row that has a missing value in any column
train_df = train_df.dropna(axis=0, how='any')

In [6]:
# Encode the target labels with a LabelEncoder and persist the fitted encoder
le = LabelEncoder()
effectiveness_labels = train_df['discourse_effectiveness']
y = le.fit_transform(effectiveness_labels)
label_encoder_path = SAVE2DIR + 'encoders/label_encoder.pkl'
joblib.dump(le, label_encoder_path)  # the saved encoder maps codes back to labels

['../saved/encoders/label_encoder.pkl']

In [15]:
# Fit a TF-IDF vectorizer (unigrams + bigrams, vocabulary capped at 5000)
# on the text column and persist the fitted vectorizer
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_text = tfidf.fit_transform(train_df['text'])
tfidf_path = SAVE2DIR + 'encoders/tfidf_vectorizer.pkl'
joblib.dump(tfidf, tfidf_path)

['../saved/encoders/tfidf_vectorizer.pkl']

In [8]:
# Standardize the text-length feature and persist the fitted scaler
scaler = StandardScaler()
length_column = train_df[['text_length']]  # double brackets keep it 2-D
X_length = scaler.fit_transform(length_column)
scaler_path = SAVE2DIR + 'encoders/standard_scaler.pkl'
joblib.dump(scaler, scaler_path)

['../saved/encoders/standard_scaler.pkl']

In [11]:
# One-hot encode 'discourse_type' (first category dropped) and persist the encoder
ohe = OneHotEncoder(drop='first')
discourse_column = train_df[['discourse_type']]
X_discourse = ohe.fit_transform(discourse_column)
onehot_path = SAVE2DIR + 'encoders/onehot_encoder.pkl'
joblib.dump(ohe, onehot_path)

['../saved/encoders/onehot_encoder.pkl']

In [16]:
# Stack the TF-IDF, scaled-length and one-hot blocks column-wise, in that order.
# Without an explicit format SciPy chooses the result format itself (commonly
# COO, which cannot be row-indexed); CSR is requested so X can be sliced and
# split directly. `sp` (scipy.sparse) comes from the notebook's import cell.
X = sp.hstack([X_text, X_length, X_discourse], format='csr')

In [17]:
# Collect the feature names in the same order the blocks were stacked into X:
# TF-IDF terms, then text length, then the one-hot discourse-type columns
tfidf_features = tfidf.get_feature_names_out()
length_feature = ['text_length']
discourse_features = ohe.get_feature_names_out(['discourse_type'])
feature_names = [*tfidf_features, *length_feature, *discourse_features]

In [18]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## 3. Entrenar Modelos con Mejores Hiperparámetros

In [20]:
# Train the logistic-regression model and persist it.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default already fits a
# multinomial (softmax) model whenever the target has more than two classes.
# NOTE(review): assumes 'discourse_effectiveness' has 3+ classes — confirm.
log_reg = LogisticRegression(
    C=1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
log_reg.fit(X_train, y_train)
joblib.dump(log_reg, SAVE2DIR + 'models/logistic_regression.pkl')

['../saved/models/logistic_regression.pkl']

In [25]:
# # Train Random Forest
# NOTE(review): this whole cell is commented out, so the visible cells neither
# train a random forest nor write 'models/random_forest.pkl'. Either re-enable
# it or remove the cell once the decision is final.
# rand_forest = RandomForestClassifier(
#     n_estimators=200,
#     max_depth=None,
#     min_samples_split=5,
#     random_state=42
# )
# rand_forest.fit(X_train, y_train)
# joblib.dump(rand_forest, SAVE2DIR + 'models/random_forest.pkl')

In [24]:
# Fit the XGBoost classifier on the training split and persist it
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
xgb_model_path = SAVE2DIR + 'models/xgboost.pkl'
joblib.dump(xgb, xgb_model_path)

['../saved/models/xgboost.pkl']