# Imports

In [14]:
!pip install category_encoders



In [15]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.impute import SimpleImputer
from category_encoders import CountEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score, accuracy_score

# Data preprocessing

In [16]:
# Read the raw table, parse the purchase date and order rows chronologically.
data = (
    pd.read_csv('/content/drive/MyDrive/dataS21/DontGetKicked/training.csv')
    .assign(PurchDate=lambda frame: pd.to_datetime(frame['PurchDate']))
    .sort_values('PurchDate')
)

# Chronological split: first 33% -> train, next 33% -> validation, remainder -> test.
n = len(data)
train_end, valid_end = int(n * 0.33), int(n * 0.66)

train_data, valid_data, test_data = (
    data.iloc[start:stop].copy()
    for start, stop in ((None, train_end), (train_end, valid_end), (valid_end, None))
)

print(f"Train shape: {train_data.shape}")
print(f"Validation shape: {valid_data.shape}")
print(f"Test shape: {test_data.shape}")

Train shape: (24084, 34)
Validation shape: (24084, 34)
Test shape: (24815, 34)


In [17]:
target_col = 'IsBadBuy'

# Column groups are derived from the training split; the target is excluded from both.
numeric_cols = (
    train_data.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop(target_col, errors='ignore')
)
categorical_cols = (
    train_data.select_dtypes(include=['object'])
    .columns
    .drop(target_col, errors='ignore')
)

# Imputers are fitted on the training split only and then applied to every split.
numeric_imputer = SimpleImputer(strategy='median')
numeric_imputer.fit(train_data[numeric_cols])
for frame in (train_data, valid_data, test_data):
    frame.loc[:, numeric_cols] = numeric_imputer.transform(frame[numeric_cols])

categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_imputer.fit(train_data[categorical_cols])
for frame in (train_data, valid_data, test_data):
    frame.loc[:, categorical_cols] = categorical_imputer.transform(frame[categorical_cols])

In [18]:
# PurchDate was only needed to order the rows for the chronological split.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
for df in [train_data, valid_data, test_data]:
    df.drop(columns=['PurchDate'], inplace=True, errors='ignore')

In [19]:
# Count-encode categorical columns; the encoder is fitted on the training split only.
count_enc = CountEncoder()
count_enc.fit(train_data[categorical_cols])

for frame in (train_data, valid_data, test_data):
    frame.loc[:, categorical_cols] = count_enc.transform(frame[categorical_cols])

In [20]:
# Separate the feature matrix from the 'IsBadBuy' target for each split.
feature_frames = [
    frame.drop(columns=['IsBadBuy']) for frame in (train_data, valid_data, test_data)
]
X_train, X_valid, X_test = (frame.values for frame in feature_frames)
y_train, y_valid, y_test = (
    frame['IsBadBuy'].values for frame in (train_data, valid_data, test_data)
)

In [21]:
# Features become float arrays, labels become integer arrays.
X_train, X_valid, X_test = (
    features.astype(float) for features in (X_train, X_valid, X_test)
)
y_train, y_valid, y_test = (
    labels.astype(int) for labels in (y_train, y_valid, y_test)
)

In [22]:
# Standardize features using statistics estimated on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)

X_train, X_valid, X_test = (
    scaler.transform(features) for features in (X_train, X_valid, X_test)
)

# CustomMLP

In [23]:
class MLP:
    """NumPy multilayer perceptron with a single hidden layer for binary labels.

    The output layer consists of two independent sigmoid units trained against
    one-hot targets with binary cross-entropy, so the two columns returned by
    ``predict_proba`` are per-class scores and need not sum to 1.

    Attributes set by ``fit``:
        W1, b1, W2, b2: Layer weights and biases.
        train_losses_ (list[float]): Training loss after each epoch.
        valid_losses_ (list[float]): Validation loss after each epoch
            (empty when no validation data is given).
    """

    _PARAM_NAMES = ('W1', 'b1', 'W2', 'b2')

    def __init__(self, n_hidden=100, activation='tanh', learning_rate=0.01, epochs=50,
                 batch_size=32, optimizer='adam', random_seed=42):
        """Initialize MLP with one hidden layer.

        Args:
            n_hidden (int): Number of neurons in the hidden layer.
            activation (str): Activation function for the hidden layer ('tanh', 'sigmoid', 'relu', 'cos').
            learning_rate (float): Learning rate.
            epochs (int): Number of epochs.
            batch_size (int): Batch size.
            optimizer (str): Optimizer ('sgd' or 'adam').
            random_seed (int): Seed for random number generator.
        """
        self.n_hidden = n_hidden
        self.activation = activation
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.random_seed = random_seed
        self.W1 = None
        self.b1 = None
        self.W2 = None
        self.b2 = None
        self.train_losses_ = []
        self.valid_losses_ = []
        self._rng = None

    def _initialize_weights(self, n_features):
        """Initialize weights with small random numbers and biases with zeros.

        A private ``RandomState`` seeded with ``random_seed`` is used instead of
        ``np.random.seed`` so that the global NumPy RNG is left untouched; it
        yields the same stream as the seeded global generator would.
        """
        self._rng = np.random.RandomState(self.random_seed)
        self.W1 = self._rng.randn(n_features, self.n_hidden) * 0.01
        self.b1 = np.zeros((1, self.n_hidden))
        self.W2 = self._rng.randn(self.n_hidden, 2) * 0.01
        self.b2 = np.zeros((1, 2))

    def _sigmoid(self, x):
        """Sigmoid activation function (input clipped to [-500, 500] to avoid overflow)."""
        x = np.array(x, dtype=np.float64)
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def _activation(self, z):
        """Apply the hidden-layer activation selected by ``self.activation``.

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """
        z = np.array(z, dtype=np.float64)
        if self.activation == 'tanh':
            return np.tanh(z)
        elif self.activation == 'sigmoid':
            return self._sigmoid(z)
        elif self.activation == 'relu':
            return np.maximum(0, z)
        elif self.activation == 'cos':
            return np.cos(z)
        else:
            raise ValueError("Unknown activation: choose 'tanh', 'sigmoid', 'relu', or 'cos'")

    def _activation_derivative(self, z, a):
        """Derivative of the hidden activation, given pre-activation ``z`` and output ``a``.

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """
        z = np.array(z, dtype=np.float64)
        if self.activation == 'tanh':
            return 1 - a ** 2
        elif self.activation == 'sigmoid':
            return a * (1 - a)
        elif self.activation == 'relu':
            return np.where(z > 0, 1, 0)
        elif self.activation == 'cos':
            return -np.sin(z)
        else:
            raise ValueError("Unknown activation: choose 'tanh', 'sigmoid', 'relu', or 'cos'")

    def _forward_pass(self, X):
        """Forward propagation.

        Returns:
            tuple: ``(z1, a1, z2, a2)`` - hidden pre-activation, hidden output,
            output pre-activation and sigmoid output of shape (n_rows, 2).
        """
        X = np.array(X, dtype=np.float64)
        z1 = np.dot(X, self.W1) + self.b1
        a1 = self._activation(z1)
        z2 = np.dot(a1, self.W2) + self.b2
        a2 = self._sigmoid(z2)
        return z1, a1, z2, a2

    def _compute_loss(self, y_true, y_pred):
        """Mean binary cross-entropy between one-hot ``y_true`` and ``y_pred``."""
        y_true = np.array(y_true, dtype=int)
        batch_size = y_true.shape[0]
        y_true_one_hot = np.zeros((batch_size, 2))
        y_true_one_hot[np.arange(batch_size), y_true] = 1
        # Clip to keep log() finite.
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        loss = -np.mean(y_true_one_hot * np.log(y_pred) + (1 - y_true_one_hot) * np.log(1 - y_pred))
        return loss

    def _backward_pass(self, X, y, z1, a1, z2, a2):
        """Backpropagation for computing gradients.

        Returns:
            tuple: ``(dW1, db1, dW2, db2)`` averaged over the batch.
        """
        batch_size = X.shape[0]
        y_one_hot = np.zeros((batch_size, 2))
        y_one_hot[np.arange(batch_size), y] = 1

        # Gradient of sigmoid + cross-entropy with respect to z2.
        dz2 = a2 - y_one_hot
        dW2 = np.dot(a1.T, dz2) / batch_size
        db2 = np.sum(dz2, axis=0, keepdims=True) / batch_size

        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * self._activation_derivative(z1, a1)

        dW1 = np.dot(X.T, dz1) / batch_size
        db1 = np.sum(dz1, axis=0, keepdims=True) / batch_size

        return dW1, db1, dW2, db2

    def _sgd_step(self, grads):
        """Apply one plain gradient-descent update to every parameter in place."""
        for name, grad in grads.items():
            param = getattr(self, name)
            param -= self.learning_rate * grad

    def _adam_step(self, grads, moments, t, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Apply one bias-corrected Adam update to every parameter in place.

        Args:
            grads (dict): Parameter name -> gradient array.
            moments (dict): Parameter name -> ``(m, v)`` running moment estimates;
                updated by this call.
            t (int): 1-based count of Adam updates performed so far, including this one.
        """
        for name, grad in grads.items():
            m, v = moments[name]
            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * (grad ** 2)
            moments[name] = (m, v)
            m_hat = m / (1 - beta1 ** t)
            v_hat = v / (1 - beta2 ** t)
            param = getattr(self, name)
            param -= self.learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

    def fit(self, X_train, y_train, X_valid=None, y_valid=None):
        """Train the model with mini-batch SGD or Adam.

        Args:
            X_train: Training features, convertible to a 2-D float array.
            y_train: Training labels (0 or 1), convertible to an int array.
            X_valid, y_valid: Optional validation data; the validation loss is
                tracked only when both are given.

        Returns:
            MLP: ``self``, to allow call chaining.

        Raises:
            ValueError: If ``self.optimizer`` is neither 'sgd' nor 'adam'
                (previously such a value silently skipped every update).
        """
        if self.optimizer not in ('sgd', 'adam'):
            raise ValueError("Unknown optimizer: choose 'adam' or 'sgd'")

        X_train = np.array(X_train, dtype=np.float64)
        y_train = np.array(y_train, dtype=int)
        has_valid = X_valid is not None and y_valid is not None
        if has_valid:
            X_valid = np.array(X_valid, dtype=np.float64)
            y_valid = np.array(y_valid, dtype=int)

        n_samples, n_features = X_train.shape
        self._initialize_weights(n_features)
        self.train_losses_ = []
        self.valid_losses_ = []

        if self.optimizer == 'adam':
            moments = {
                name: (np.zeros_like(getattr(self, name)), np.zeros_like(getattr(self, name)))
                for name in self._PARAM_NAMES
            }
            t = 0

        for epoch in range(self.epochs):
            # Reshuffle every epoch using the model's own RNG stream.
            indices = self._rng.permutation(n_samples)
            X_train_shuffled = X_train[indices]
            y_train_shuffled = y_train[indices]

            for i in range(0, n_samples, self.batch_size):
                X_batch = X_train_shuffled[i:i + self.batch_size]
                y_batch = y_train_shuffled[i:i + self.batch_size]
                z1, a1, z2, a2 = self._forward_pass(X_batch)
                grads = dict(zip(
                    self._PARAM_NAMES,
                    self._backward_pass(X_batch, y_batch, z1, a1, z2, a2),
                ))

                if self.optimizer == 'sgd':
                    self._sgd_step(grads)
                else:
                    t += 1
                    self._adam_step(grads, moments, t)

            _, _, _, train_pred = self._forward_pass(X_train)
            self.train_losses_.append(self._compute_loss(y_train, train_pred))

            if has_valid:
                _, _, _, valid_pred = self._forward_pass(X_valid)
                self.valid_losses_.append(self._compute_loss(y_valid, valid_pred))

        return self

    def predict_proba(self, X):
        """Return the two sigmoid output scores for every row of ``X``.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        if self.W1 is None:
            raise RuntimeError("MLP is not fitted yet; call fit() first")
        X = np.array(X, dtype=np.float64)
        _, _, _, a2 = self._forward_pass(X)
        return a2

    def predict(self, X):
        """Predict class labels as the index of the larger output score."""
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)

In [24]:
def gini_score(y_true, y_pred_proba):
    """Calculate the Gini coefficient as ``2 * ROC-AUC - 1``.

    Args:
        y_true: Binary ground-truth labels.
        y_pred_proba: Either a 1-D array of positive-class scores or a 2-D
            ``predict_proba``-style array whose column 1 holds them.

    Returns:
        float: The Gini coefficient.
    """
    scores = np.asarray(y_pred_proba)
    # Accept both shapes so the result does not depend on which of the
    # notebook's two gini_score definitions was executed last.
    if scores.ndim == 2:
        scores = scores[:, 1]
    roc_auc = roc_auc_score(y_true, scores)
    return 2 * roc_auc - 1

In [25]:
# Collects one metrics row (a dict) per trained model configuration.
results = list()

In [26]:
activations = ['tanh', 'sigmoid', 'relu']
optimizers = ['adam', 'sgd']

# Train one CustomMLP per (activation, optimizer) pair and record validation metrics.
# NOTE(review): the learning rates here (adam=0.01, sgd=0.001) are the reverse of the
# ones used in the MLPClassifier and PyTorchMLP sweeps - confirm this is intentional.
for activation in activations:
    for optimizer in optimizers:
        if optimizer == 'adam':
            learning_rate = 0.01
        else:
            learning_rate = 0.001

        model = MLP(n_hidden=128, activation=activation, learning_rate=learning_rate,
                    epochs=16, batch_size=64, optimizer=optimizer)
        model.fit(X_train, y_train, X_valid, y_valid)

        predictions_valid = model.predict(X_valid)
        proba_valid = model.predict_proba(X_valid)  # two columns; column 1 is the positive class

        valid_accuracy = accuracy_score(y_valid, predictions_valid)
        roc_auc_valid = roc_auc_score(y_valid, proba_valid[:, 1])
        gini_valid = gini_score(y_valid, proba_valid)

        row = {
            'Model': 'CustomMLP',
            'Activation': activation,
            'Optimizer': optimizer,
            'Validation Accuracy': valid_accuracy,
            'Validation ROC-AUC': roc_auc_valid,
            'Validation Gini': gini_valid,
        }
        results.append(row)

# MLPClassifier

In [27]:
def gini_score(y_true, y_pred_proba):
    """Calculate the Gini coefficient as ``2 * ROC-AUC - 1``.

    Args:
        y_true: Binary ground-truth labels.
        y_pred_proba: Either a 1-D array of positive-class scores or a 2-D
            ``predict_proba``-style array whose column 1 holds them.

    Returns:
        float: The Gini coefficient.
    """
    scores = np.asarray(y_pred_proba)
    # Accept both shapes so cells written for either of the notebook's two
    # gini_score definitions keep working regardless of execution order.
    if scores.ndim == 2:
        scores = scores[:, 1]
    roc_auc = roc_auc_score(y_true, scores)
    return 2 * roc_auc - 1

In [28]:
activations = ['tanh', 'logistic', 'relu']
solvers = ['adam', 'sgd']

# Train one scikit-learn MLPClassifier per (activation, solver) pair and record
# validation metrics.
for activation in activations:
    for solver in solvers:
        uses_adam = solver == 'adam'

        model = MLPClassifier(
            hidden_layer_sizes=(128,),
            activation=activation,
            solver=solver,
            learning_rate_init=0.001 if uses_adam else 0.01,
            max_iter=200 if uses_adam else 500,
            batch_size=64,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=20,
        )
        model.fit(X_train, y_train)

        predictions_valid = model.predict(X_valid)
        proba_valid = model.predict_proba(X_valid)[:, 1]  # positive-class column

        valid_accuracy = accuracy_score(y_valid, predictions_valid)
        roc_auc_valid = roc_auc_score(y_valid, proba_valid)
        gini_valid = gini_score(y_valid, proba_valid)

        row = {
            'Model': 'MLPClassifier',
            'Activation': activation,
            'Optimizer': solver,
            'Validation Accuracy': valid_accuracy,
            'Validation ROC-AUC': roc_auc_valid,
            'Validation Gini': gini_valid,
        }
        results.append(row)

# PyTorchMLP

In [29]:
class PyTorchMLP(nn.Module):
    """One-hidden-layer binary classifier with a scikit-learn-like interface.

    The network is ``Linear -> activation -> Linear(1) -> Sigmoid`` and is
    trained with binary cross-entropy.

    Attributes set by ``fit``:
        train_losses_ (list[float]): Sample-weighted mean training loss per epoch.
        valid_losses_ (list[float]): Validation loss per epoch (empty when no
            validation data is given).
    """

    def __init__(self, input_size, hidden_size=100, activation='tanh', learning_rate=0.001,
                 epochs=200, batch_size=128, optimizer='adam', random_seed=42):
        """Build the network and initialize its weights.

        Args:
            input_size (int): Number of input features.
            hidden_size (int): Number of hidden units.
            activation (str): Hidden activation ('tanh', 'sigmoid' or 'relu').
            learning_rate (float): Optimizer learning rate.
            epochs (int): Number of training epochs.
            batch_size (int): Mini-batch size.
            optimizer (str): 'adam' or 'sgd' (SGD uses momentum 0.9).
            random_seed (int): Seed passed to ``torch.manual_seed``.

        Raises:
            ValueError: If ``activation`` is not supported.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.activation = activation
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer_name = optimizer
        self.random_seed = random_seed
        self.train_losses_ = []
        self.valid_losses_ = []

        self.fc1 = nn.Linear(input_size, hidden_size)
        if activation == 'tanh':
            self.activation_fn = nn.Tanh()
        elif activation == 'sigmoid':
            self.activation_fn = nn.Sigmoid()
        elif activation == 'relu':
            self.activation_fn = nn.ReLU()
        else:
            raise ValueError("Unknown activation: choose 'tanh', 'sigmoid', or 'relu'")
        self.fc2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

        # Seed the global torch RNG, then overwrite the default layer
        # initialization with Xavier-uniform weights and zero biases.
        torch.manual_seed(random_seed)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.zeros_(self.fc2.bias)

    def forward(self, x):
        """Return positive-class probabilities of shape (batch, 1)."""
        x = self.fc1(x)
        x = self.activation_fn(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

    def _prepare_data(self, X, y=None):
        """Convert array-likes to float32 tensors; ``y`` is reshaped to a column."""
        X = np.array(X, dtype=np.float64)
        if y is not None:
            y = np.array(y, dtype=np.int64).reshape(-1, 1)

        X_tensor = torch.tensor(X, dtype=torch.float32)
        if y is not None:
            y_tensor = torch.tensor(y, dtype=torch.float32)
            return X_tensor, y_tensor
        return X_tensor

    def fit(self, X_train, y_train, X_valid=None, y_valid=None):
        """Train the model, using CUDA when it is available.

        Args:
            X_train, y_train: Training features and binary labels.
            X_valid, y_valid: Optional validation data; the validation loss is
                tracked only when both are given.

        Returns:
            PyTorchMLP: ``self``.

        Raises:
            ValueError: If the configured optimizer is neither 'adam' nor 'sgd'.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)

        X_train_tensor, y_train_tensor = self._prepare_data(X_train, y_train)
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        has_valid = X_valid is not None and y_valid is not None
        if has_valid:
            X_valid_tensor, y_valid_tensor = self._prepare_data(X_valid, y_valid)
            # Transfer once instead of on every epoch.
            X_valid_tensor = X_valid_tensor.to(device)
            y_valid_tensor = y_valid_tensor.to(device)

        if self.optimizer_name == 'adam':
            optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        elif self.optimizer_name == 'sgd':
            optimizer = optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.9)
        else:
            raise ValueError("Unknown optimizer: choose 'adam' or 'sgd'")

        criterion = nn.BCELoss()
        self.train_losses_ = []
        self.valid_losses_ = []

        for epoch in range(self.epochs):
            self.train()
            train_loss = 0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()
                outputs = self(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
                # Weight by batch size so the epoch value is a per-sample mean.
                train_loss += loss.item() * X_batch.size(0)
            train_loss /= len(train_loader.dataset)
            self.train_losses_.append(train_loss)

            if has_valid:
                self.eval()
                with torch.no_grad():
                    outputs_valid = self(X_valid_tensor)
                    valid_loss = criterion(outputs_valid, y_valid_tensor).item()
                self.valid_losses_.append(valid_loss)

        return self

    def predict_proba(self, X):
        """Return an (n_rows, 2) array ``[1 - p, p]`` of class probabilities."""
        # Use the device the parameters actually live on; choosing it from CUDA
        # availability would mismatch a model that was never moved by fit().
        device = next(self.parameters()).device
        self.eval()
        X_tensor = self._prepare_data(X)
        with torch.no_grad():
            proba = self(X_tensor.to(device)).cpu().numpy().flatten()
        return np.vstack([1 - proba, proba]).T

    def predict(self, X):
        """Predict class labels by thresholding the positive-class probability at 0.5."""
        proba = self.predict_proba(X)
        return (proba[:, 1] > 0.5).astype(int)

In [30]:
activations = ['tanh', 'sigmoid', 'relu']
optimizers = ['adam', 'sgd']

# Train one PyTorchMLP per (activation, optimizer) pair and record validation metrics.
for activation in activations:
    for opt in optimizers:
        if opt == 'adam':
            learning_rate = 0.001
        else:
            learning_rate = 0.01

        model = PyTorchMLP(input_size=X_train.shape[1], hidden_size=128,
                           activation=activation, learning_rate=learning_rate,
                           epochs=16, batch_size=64, optimizer=opt)
        model.fit(X_train, y_train, X_valid, y_valid)

        predictions_valid = model.predict(X_valid)
        proba_valid = model.predict_proba(X_valid)[:, 1]  # positive-class column

        valid_accuracy = accuracy_score(y_valid, predictions_valid)
        roc_auc_valid = roc_auc_score(y_valid, proba_valid)
        gini_valid = gini_score(y_valid, proba_valid)

        row = {
            'Model': 'PyTorchMLP',
            'Activation': activation,
            'Optimizer': opt,
            'Validation Accuracy': valid_accuracy,
            'Validation ROC-AUC': roc_auc_valid,
            'Validation Gini': gini_valid,
        }
        results.append(row)

PyTorchMLP is expected to perform better due to:

1. Automatic gradient computation via Autograd
2. Efficient batch processing via DataLoader
3. Numerical stability due to PyTorch's built-in mechanisms

# Results

In [31]:
# One row per trained configuration; the bare name on the last line renders the table.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Model,Activation,Optimizer,Validation Accuracy,Validation ROC-AUC,Validation Gini
0,CustomMLP,tanh,adam,0.868626,0.666855,0.33371
1,CustomMLP,tanh,sgd,0.868876,0.643507,0.287014
2,CustomMLP,sigmoid,adam,0.869042,0.67496,0.349919
3,CustomMLP,sigmoid,sgd,0.868876,0.63473,0.269459
4,CustomMLP,relu,adam,0.86846,0.679959,0.359919
5,CustomMLP,relu,sgd,0.868876,0.620606,0.241213
6,MLPClassifier,tanh,adam,0.869249,0.685451,0.370903
7,MLPClassifier,tanh,sgd,0.869042,0.670666,0.341331
8,MLPClassifier,logistic,adam,0.868834,0.658606,0.317213
9,MLPClassifier,logistic,sgd,0.868876,0.650164,0.300328


In [32]:
# Refit the ReLU + Adam PyTorchMLP configuration and report Gini on every split.
model = PyTorchMLP(input_size=X_train.shape[1], hidden_size=128, activation='relu',
                   learning_rate=0.001, epochs=16, batch_size=64, optimizer='adam')
model.fit(X_train, y_train, X_valid, y_valid)

# Positive-class probabilities for the train, validation and test splits.
proba_train, proba_valid, proba_test = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_valid, X_test)
)

gini_train = gini_score(y_train, proba_train)
gini_valid = gini_score(y_valid, proba_valid)
gini_test = gini_score(y_test, proba_test)

print(f"Training Gini: {gini_train:.4f}")
print(f"Validation Gini: {gini_valid:.4f}")
print(f"Test Gini: {gini_test:.4f}")

Training Gini: 0.4931
Validation Gini: 0.3653
Test Gini: 0.4209


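The Gini coefficient dips on the validation set (0.3653) relative to the training set (0.4931) but rises again on the test set (0.4209). Because the score on the most recent, unseen period does not keep degrading, the model does not appear to be severely overfitted, although the remaining gap between the training and held-out scores suggests that some overfitting is still present.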