In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

2024-02-25 13:18:09.064989: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-25 13:18:09.100011: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-25 13:18:09.100039: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-25 13:18:09.100978: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-25 13:18:09.106090: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-25 13:18:09.106410: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Configuración de TPU (Opcional)
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    strategy = tf.distribute.get_strategy()  # CPU or single GPU
    print('Running on', strategy.num_replicas_in_sync, 'Replica(s)')

Running on 1 Replica(s)


In [3]:
# Cargando los datos
data = './Data/Imputed_DF_DTR.csv'  # Asegúrate de que la ruta al archivo es correcta
df = pd.read_csv(data)

In [4]:
# Crea una copia de seguridad del DataFrame original
df_original = df.copy()

In [5]:
# Función para eliminar outliers basados en el IQR de todas las columnas numéricas
def eliminar_outliers(df):
    for columna in df.select_dtypes(include=['float64', 'int64']).columns:
        Q1 = df[columna].quantile(0.25)
        Q3 = df[columna].quantile(0.75)
        IQR = Q3 - Q1
        limite_inferior = Q1 - 1.5 * IQR
        limite_superior = Q3 + 1.5 * IQR

        df = df[(df[columna] >= limite_inferior) & (df[columna] <= limite_superior)]
        
    return df

In [6]:
# Elimina outliers en todas las columnas numéricas utilizando la copia del DataFrame
df_limpio = eliminar_outliers(df_original.copy())

In [7]:
# Muestra el tamaño del DataFrame antes y después de la eliminación de outliers
print("Tamaño original del DataFrame:", df_original.shape)
print("Tamaño del DataFrame después de eliminar outliers:", df_limpio.shape)

Tamaño original del DataFrame: (1197994, 38)
Tamaño del DataFrame después de eliminar outliers: (295258, 38)


In [8]:
# Paso 1: Manejo de valores 0 o negativos antes de aplicar log
df_limpio = np.log1p(df_limpio.select_dtypes(include=['float64', 'int64']))

In [9]:
# Añadir de nuevo las columnas no numéricas (si las hay)
df_limpio = df_limpio.join(df_original.select_dtypes(exclude=['float64', 'int64']))

In [10]:
# Paso 2: Verificación y limpieza de valores infinitos o NaN después de aplicar log
df_limpio.replace([np.inf, -np.inf], np.nan, inplace=True)
df_limpio.fillna(df_limpio.mean(), inplace=True)  # Reemplazar NaN por la media

In [11]:
# Preparación de los datos
X = df_limpio.drop('ITBU', axis=1)
y = df_limpio['ITBU'] - 1

In [12]:
# Normalización de los datos
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# OneHot encoding para la columna objetivo
y_encoded = to_categorical(y)

In [14]:
# Crear un Dataset usando tf.data
def get_dataset(X, y, batch_size=10240):
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.cache().shuffle(len(X)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [15]:
# Convertir los datos a tf.data.Dataset
dataset = get_dataset(X_scaled, y_encoded, batch_size=10240 * strategy.num_replicas_in_sync)

2024-02-25 13:18:15.259368: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-25 13:18:15.260014: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [16]:
with strategy.scope():
    # Definición de la red neuronal
    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(512, activation='relu'),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(y_encoded.shape[1], activation='softmax')
    ])

In [17]:
# Compilación del modelo
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Resumen del modelo
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               19456     
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 194082 (758.13 KB)
Trainable params: 19408

In [19]:
# Entrenamiento del modelo
callback = EarlyStopping(monitor='loss', patience=20)
history = model.fit(dataset, epochs=1000, callbacks=[callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [20]:
# Evaluación del modelo
score = model.evaluate(dataset)
print(f"Loss: {score[0]}, Accuracy: {score[1]}")

Loss: 0.2547682821750641, Accuracy: 0.8983736038208008


In [24]:
import pickle
import os  # Asegúrate de importar os

# Guarda el modelo
with open('modelo_log.pkl', 'wb') as file:
    pickle.dump(model, file)

# Verifica que el archivo se guardó correctamente
print(os.path.isfile('modelo_log.pkl'))

True


In [22]:
# df_limpio.info

In [23]:
# # Guarda el DataFrame en un archivo CSV
# df_limpio.to_csv('df_limpio.csv', index=False)

# # Verifica que el archivo se ha guardado correctamente
# print("Archivo guardado:", os.path.isfile('df_limpio.csv'))