# Preprocesamiento

## 1. Bibliotecas

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

## 2. Datos

In [13]:
data = pd.read_csv("./../data/train.csv")

## 3. Limpieza y preprocesamiento
* Elección de columnas
* Cambio de tipo de variables
* Escalamiento de variables

In [14]:
def transform_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Select the required columns
    cols = [
        'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 
        'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 
        'EstimatedSalary', 'Exited'
    ]
    df = df[cols].copy()

    # One-hot encode the 'Geography' column
    df = pd.get_dummies(df, columns=['Geography'], prefix='Geo')

    # Convert 'Gender' to boolean. 
    # Here we assume 'Male' maps to True and 'Female' to False.
    df['Gender'] = df['Gender'].apply(lambda x: True if x == 'Male' else False)

    # Convert other binary columns to boolean
    bool_cols = ['HasCrCard', 'IsActiveMember', 'Exited']
    df[bool_cols] = df[bool_cols].astype(bool)

    # Ensure 'Age' is integer type
    df['Age'] = df['Age'].astype(int)

    # Scale 'Balance' and 'EstimatedSalary' using MinMaxScaler
    scaler = MinMaxScaler()
    df[['Balance', 'EstimatedSalary']] = scaler.fit_transform(df[['Balance', 'EstimatedSalary']])
    
    return df


In [15]:
def balance_downsample(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downsamples the majority class in the 'Exited' column to match the number of samples in the minority class.
    """
    # Determine majority and minority classes
    counts = df['Exited'].value_counts()
    majority_class = counts.idxmax()
    minority_class = counts.idxmin()
    
    # Separate majority and minority samples
    df_majority = df[df['Exited'] == majority_class]
    df_minority = df[df['Exited'] == minority_class]
    
    # Downsample the majority class to the number of minority samples
    df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)
    
    # Combine the downsampled majority with the minority class and shuffle the result
    df_balanced = pd.concat([df_majority_downsampled, df_minority]).reset_index(drop=True)
    return df_balanced

def balance_oversample_resample(df: pd.DataFrame) -> pd.DataFrame:
    """
    Oversamples the minority class by duplicating existing rows using sklearn's resample.
    The minority class is upsampled to match the number of samples in the majority class.
    """
    # Determine majority and minority classes
    counts = df['Exited'].value_counts()
    majority_class = counts.idxmax()
    minority_class = counts.idxmin()
    
    # Separate majority and minority samples
    df_majority = df[df['Exited'] == majority_class]
    df_minority = df[df['Exited'] == minority_class]
    
    # Oversample minority class by resampling with replacement
    df_minority_upsampled = resample(df_minority,
                                     replace=True,
                                     n_samples=len(df_majority),
                                     random_state=42)
    
    # Combine the original majority with the upsampled minority and shuffle the result
    df_balanced = pd.concat([df_majority, df_minority_upsampled]).reset_index(drop=True)
    return df_balanced

def balance_smote(df: pd.DataFrame) -> pd.DataFrame:
    """
    Balances the dataset using the SMOTE algorithm to generate synthetic samples for the minority class.
    Note: Ensure that all feature columns (i.e., all columns except 'Exited') are numeric.
    """
    # Separate features and target variable
    X = df.drop('Exited', axis=1)
    y = df['Exited']
    
    # Initialize SMOTE and perform the oversampling
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    
    # Recombine the features and the resampled target into a DataFrame
    df_res = pd.DataFrame(X_res, columns=X.columns)
    df_res['Exited'] = y_res
    return df_res


In [16]:
data['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [17]:
data = transform_dataframe(data)
data = balance_downsample(data)
# data = balance_oversample_resample(data)
# data = balance_smote(data)


In [18]:
data['Exited'].value_counts()

Exited
False    34921
True     34921
Name: count, dtype: int64

In [19]:
# Suppose 'data' is your DataFrame and 'Exited' is your target variable.
# Split the DataFrame into features (X) and target (y)
X = data.drop('Exited', axis=1)
y = data['Exited']

# Split the features and target into training and testing sets.
# test_size=0.2 means 20% of the data will be used for testing.
# random_state ensures reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, if you need to combine the features and target back for train/test DataFrames:
train_data = X_train.copy()
train_data['Exited'] = y_train
test_data = X_test.copy()
test_data['Exited'] = y_test

print("Training set shape:", train_data.shape)
print("Testing set shape:", test_data.shape)


Training set shape: (55873, 13)
Testing set shape: (13969, 13)


In [None]:
# Build the neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile the model with binary_crossentropy loss for a binary classification task
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    loss='binary_crossentropy', 
    metrics=['accuracy', tf.keras.metrics.Recall()])

# Train the model; using a validation split from the training data for monitoring performance
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate the model on the test set
test_loss, test_accuracy, test_recall = model.evaluate(X_test, y_test, verbose=1)
print('Test Accuracy: {:.2f}%'.format(test_accuracy * 100))
print('Test Recall: {:.2f}%'.format(test_recall * 100))

# Generate predictions and a classification report
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred))

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1397/1397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.5039 - loss: 5.0011 - recall_1: 0.3989 - val_accuracy: 0.5074 - val_loss: 0.6930 - val_recall_1: 0.0000e+00
Epoch 2/100
[1m1397/1397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.4991 - loss: 0.6991 - recall_1: 0.2757 - val_accuracy: 0.4926 - val_loss: 0.6932 - val_recall_1: 1.0000
Epoch 3/100
[1m1397/1397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.4979 - loss: 0.6952 - recall_1: 0.4859 - val_accuracy: 0.4926 - val_loss: 0.6934 - val_recall_1: 1.0000
Epoch 4/100
[1m1397/1397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.4982 - loss: 0.6938 - recall_1: 0.8845 - val_accuracy: 0.5074 - val_loss: 0.6931 - val_recall_1: 0.0000e+00
Epoch 5/100
[1m1397/1397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.5021 - loss: 0.6939 - recall_1: 0.1611 - val_accuracy: 0.4926 - val_loss:

In [None]:
keys_list = list(history.history.keys())
recall = keys_list[2]
val_recall = keys_list[5]
# Plot training & validation accuracy values
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')

plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history[recall], label='Train Recall')
plt.plot(history.history[val_recall], label='Validation Recall')
plt.title('Model Recall')
plt.xlabel('Epoch')
plt.ylabel('Recall')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()


In [None]:
# Print the model summary
model.summary()

In [None]:
import tensorflow as tf

# Create a Recall metric object
recall_metric = tf.keras.metrics.Recall()

# Update the metric state with the true labels and predictions
recall_metric.update_state(y_test, y_pred)

# Get the result
recall_tf = recall_metric.result().numpy()
print('Recall (TensorFlow):', recall_tf)


In [None]:
X_train.shape[1]

In [None]:
history.history.keys()
