In [115]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [116]:
# --- 1. Load the original CSV file ---
# The path is relative, so it is resolved against the notebook's working directory.
input_file = "Datasets/diabetes_012_health_indicators_BRFSS2015.csv"
data = pd.read_csv(filepath_or_buffer=input_file)

In [117]:
# --- 2. Preprocess the data ---
# Fit a min-max scaler on every column (default feature range) and scale the frame.
scaler = MinMaxScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)

# Remember each column's original (min, max) so generated samples can be
# mapped back to the original units later.
column_ranges = {}
for col in data.columns:
    column_values = data[col]
    column_ranges[col] = (column_values.min(), column_values.max())

In [118]:
# --- Split the scaled data into training and validation sets (80% / 20%) ---
train_data, val_data = train_test_split(
    data_scaled,
    test_size=0.2,
    random_state=42,
)

# Convert both splits to float32 PyTorch tensors
train_tensor, val_tensor = (
    torch.tensor(split, dtype=torch.float32) for split in (train_data, val_data)
)

# Positional indices of the columns of interest
bmi_index, menthlth_index, physhlth_index = (
    data.columns.get_loc(name) for name in ("BMI", "MentHlth", "PhysHlth")
)

In [120]:
# --- 3. Definir el modelo GAN ---
class Generator(nn.Module):
    """Fully connected generator network.

    Maps a latent vector of size ``input_dim`` through hidden layers of
    width 512, 1024 and 512 (each followed by ReLU) to ``output_dim``
    values squashed by a sigmoid.
    """

    def __init__(self, input_dim, output_dim):
        """Build the layer stack.

        Args:
            input_dim: Size of the latent input vector.
            output_dim: Number of values produced per sample.
        """
        super().__init__()
        hidden_widths = (512, 1024, 512)

        # Layers are created in network order so module indices (and the
        # resulting state_dict keys) are 0..7.
        layers = []
        in_features = input_dim
        for width in hidden_widths:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, output_dim))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid output."""
        return self.model(x)
    
class Discriminator(nn.Module):
    """Fully connected discriminator network.

    Reduces an ``input_dim``-sized sample through hidden layers of width
    512 and 256 (LeakyReLU, slope 0.2) to a single sigmoid output.
    """

    def __init__(self, input_dim):
        """Build the layer stack.

        Args:
            input_dim: Number of features in each sample to be scored.
        """
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=512),
            nn.LeakyReLU(negative_slope=0.2),
            nn.Linear(in_features=512, out_features=256),
            nn.LeakyReLU(negative_slope=0.2),
            nn.Linear(in_features=256, out_features=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return one score per row."""
        return self.model(x)

In [121]:
# Dimensions: size of the generator's noise input and number of feature columns
latent_dim = 100
data_dim = train_tensor.size(1)

In [122]:
# Seed PyTorch's global RNG before any weights are created so that weight
# initialisation, mini-batch sampling and latent noise are reproducible on
# Restart & Run All. 42 matches the random_state used for the train/val split.
torch.manual_seed(42)

# Instantiate the generator and the discriminator
generator = Generator(latent_dim, data_dim)
discriminator = Discriminator(data_dim)

In [123]:
# --- 4. Training configuration ---
# Number of training iterations and samples drawn per iteration
epochs = 10_000
batch_size = 64

# Binary cross-entropy on the discriminator's sigmoid output
criterion = nn.BCELoss()

# Separate Adam optimisers; the discriminator uses half the generator's learning rate
optimizer_g = optim.Adam(params=generator.parameters(), lr=2e-4)
optimizer_d = optim.Adam(params=discriminator.parameters(), lr=1e-4)

In [124]:
# Per-iteration loss history for the generator and the discriminator
g_losses_train, d_losses_train = [], []

for epoch in range(epochs):
    # Draw a random real mini-batch (indices sampled with replacement) and
    # a batch of generated samples from fresh latent noise.
    batch_idx = torch.randint(0, train_tensor.size(0), (batch_size,))
    real_data = train_tensor[batch_idx]
    fake_latent = torch.randn(batch_size, latent_dim)
    fake_data = generator(fake_latent)

    # Targets: 1 for real rows, 0 for generated rows
    real_labels = torch.ones(batch_size, 1)
    fake_labels = torch.zeros_like(real_labels)

    # --- Discriminator step ---
    optimizer_d.zero_grad()
    real_pred = discriminator(real_data)
    # detach() keeps this step from back-propagating into the generator
    fake_pred = discriminator(fake_data.detach())
    real_loss = criterion(real_pred, real_labels)
    fake_loss = criterion(fake_pred, fake_labels)
    d_loss = real_loss + fake_loss
    d_loss.backward()
    optimizer_d.step()

    # --- Generator step ---
    # The generator is scored against a target of 1 for its own samples.
    optimizer_g.zero_grad()
    fake_labels = torch.ones_like(fake_labels)
    g_loss = criterion(discriminator(fake_data), fake_labels)
    g_loss.backward()
    optimizer_g.step()

    # Record this iteration's losses
    d_losses_train.append(d_loss.item())
    g_losses_train.append(g_loss.item())

    # Report progress every 500 iterations
    if epoch % 500 == 0:
        print(f"Epoch {epoch}/{epochs} | D Loss: {d_loss.item()} | G Loss: {g_loss.item()}")

Epoch 0/10000 | D Loss: 1.3652408123016357 | G Loss: 0.7153164148330688
Epoch 500/10000 | D Loss: 1.0837342739105225 | G Loss: 0.914198637008667
Epoch 1000/10000 | D Loss: 1.25753915309906 | G Loss: 0.6913037300109863
Epoch 1500/10000 | D Loss: 1.362099289894104 | G Loss: 0.8201712369918823
Epoch 2000/10000 | D Loss: 1.3366727828979492 | G Loss: 0.826366126537323
Epoch 2500/10000 | D Loss: 1.25968337059021 | G Loss: 0.9821163415908813
Epoch 3000/10000 | D Loss: 1.487748146057129 | G Loss: 0.6618410348892212
Epoch 3500/10000 | D Loss: 1.5428130626678467 | G Loss: 0.8575720191001892
Epoch 4000/10000 | D Loss: 0.19278669357299805 | G Loss: 2.979649543762207
Epoch 4500/10000 | D Loss: 0.04733400046825409 | G Loss: 4.327676296234131
Epoch 5000/10000 | D Loss: 0.04631864279508591 | G Loss: 4.462451934814453
Epoch 5500/10000 | D Loss: 0.08297702670097351 | G Loss: 3.310225486755371
Epoch 6000/10000 | D Loss: 0.2304740995168686 | G Loss: 3.860424280166626
Epoch 6500/10000 | D Loss: 0.013888014

In [126]:
# --- Generate new samples ---
num_samples = 1000
latent_samples = torch.randn((num_samples, latent_dim))

# Sampling is inference only, so skip building an autograd graph.
with torch.no_grad():
    generated_data = generator(latent_samples).numpy()

# Undo the min-max scaling for all columns at once using the ranges saved
# in `column_ranges` (same formula as before: x * (max - min) + min).
col_min = np.array([column_ranges[col][0] for col in data.columns], dtype=np.float64)
col_max = np.array([column_ranges[col][1] for col in data.columns], dtype=np.float64)
generated_data = generated_data * (col_max - col_min) + col_min

# Round to the NEAREST integer. The previous np.ceil pushed every value
# above the lower bound up to the next integer, so e.g. a 0/1 column came
# out as 1 for practically every sample (the sigmoid output is almost never
# exactly 0), which distorts the generated distribution.
generated_data = np.rint(generated_data)

# Keep every column inside the range observed in the original data.
generated_data = np.clip(generated_data, col_min, col_max)

# Restrict the class column to its valid labels {0, 1, 2}; look the column
# up by name instead of assuming it sits at position 0.
diabetes_index = data.columns.get_loc("Diabetes_012")
generated_data[:, diabetes_index] = np.clip(generated_data[:, diabetes_index], 0, 2)

# Save the generated samples to a new CSV file
output_file = "Datasets/generated_data2.csv"
pd.DataFrame(generated_data, columns=data.columns).to_csv(output_file, index=False)
print(f"Nuevos datos generados y guardados en: {output_file}")

Nuevos datos generados y guardados en: Datasets/generated_data2.csv
