In [107]:
import pandas as pd
import torch
import numpy as np
from session_extraction import extract_users_data
from torch import nn, optim
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.exceptions import UndefinedMetricWarning
import torch.utils.data as data
import random

In [108]:
device = torch.device('cuda')
seed = 8888

torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [109]:
iteration_path = "iteration_3/"
deliveries_path = "../data/" + iteration_path + "raw/deliveries.jsonl"
products_path = "../data/" + iteration_path + "raw/products.jsonl"
sessions_path = "../data/" + iteration_path + "raw/sessions.jsonl"
users_path = "../data/" + iteration_path + "raw/users.jsonl"

deliveries_data = pd.read_json(deliveries_path, lines=True)
products_data = pd.read_json(products_path, lines=True)
sessions_data = pd.read_json(sessions_path, lines=True)
users_data = pd.read_json(users_path, lines=True)

In [110]:
df = extract_users_data(sessions_data, users_data, products_data)
df = df.drop(columns=["user_id"])
df

Unnamed: 0,expenses,products_bought,events_number,city
0,23400.85,49,256,Kraków
1,44677.14,60,331,Radom
2,16312.03,23,130,Radom
3,7273.05,16,86,Kraków
4,33412.55,44,318,Poznań
...,...,...,...,...
195,0.00,0,7,Warszawa
196,109.00,1,5,Warszawa
197,0.00,0,5,Poznań
198,78.96,2,6,Szczecin


In [111]:
categorical_values = pd.get_dummies(df["city"])
categorical_values

Unnamed: 0,Gdynia,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław
0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
195,0,0,0,0,0,1,0
196,0,0,0,0,0,1,0
197,0,0,1,0,0,0,0
198,0,0,0,0,1,0,0


In [112]:
numerical_values = df.drop(columns=["city"])
train_indices = np.random.rand(len(numerical_values))>0.1

In [113]:
numerical_data = torch.from_numpy(numerical_values.values[train_indices,1:]).float()
categorical_data = torch.from_numpy(categorical_values.values[train_indices]).float()
targets = torch.from_numpy(numerical_values.values[train_indices,0]).float()

validation_numerical_data = torch.from_numpy(numerical_values.values[~train_indices,1:]).float()
validation_categorical_data = torch.from_numpy(categorical_values.values[~train_indices]).float()
validation_targets = torch.from_numpy(numerical_values.values[~train_indices,0]).float()

In [114]:
train_dataset = data.TensorDataset(numerical_data,categorical_data,targets)
validation_dataset = data.TensorDataset(validation_numerical_data,validation_categorical_data,validation_targets)

In [115]:
class FlatsClassifier(nn.Module):
    def __init__(self, l2_size, l3_size):
        super().__init__()
        self.emb_layer = nn.Linear(categorical_data.shape[1], categorical_data.shape[1])
        self.act_emb = nn.Tanh()
        self.layer1 = nn.Linear(numerical_data.shape[1] + categorical_data.shape[1], l2_size)
        self.batch_norm1 = nn.BatchNorm1d(l2_size)
        self.act_1 =  nn.LeakyReLU()
        self.layer2 = nn.Linear(l2_size, l3_size)
        self.batch_norm2 = nn.BatchNorm1d(l3_size)
        self.act_2 =  nn.LeakyReLU()
        self.layer3 = nn.Linear(l3_size, 1)
        self.act_out = nn.Sigmoid()
    def forward(self, x, cat_x):
        cat_x_embedded = self.emb_layer(cat_x)
        cat_x_embedded = self.act_emb(cat_x_embedded)
        x = torch.cat([x,cat_x_embedded],dim=1)
        activation1 = self.act_1(self.batch_norm1(self.layer1(x)))
        activation2 = self.act_2(self.batch_norm2(self.layer2(activation1)))
        output = self.act_out(self.layer3(activation2))
        return output

In [None]:
def get_accuracy(model, data_loader):
    model.eval()
    with torch.no_grad():
        for x, cat_x, targets in data_loader:
            x, cat_x, targets = x.to(device), cat_x.to(device), targets.to(device)
            preds = model(x, cat_x)
            try:
                roc_auc = roc_auc_score(targets.cpu().numpy(), preds.cpu().numpy())
            except ValueError:
                roc_auc = 0
    return roc_auc

def get_binary_accuracy(model, data_loader, binary_threshold):
    correct = 0
    total = 0
    model.eval() #*********#
    for x, cat_x, labels in data_loader:
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)
        output = model(x, cat_x)
        pred = (output>binary_threshold).float()
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += x.shape[0]
    return correct / total

In [117]:
layer2_size = 10
layer3_size = 3
lr = 0.0002
epochs = 500
learning_threshold = 0.993

model = FlatsClassifier(layer2_size, layer3_size).to(device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=48, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=validation_numerical_data.shape[0], shuffle=False)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

iters = []
losses = []
train_acc = []
val_acc = []
val_acc_bin = []
for n in range(epochs):
    epoch_losses = []
    for x, cat_x, labels in iter(train_loader):
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)
        model.train()
        out = model(x, cat_x).squeeze()

        loss = criterion(out, labels)
        loss.backward()
        epoch_losses.append(loss.item())
        optimizer.step()
        optimizer.zero_grad()

    loss_mean = np.array(epoch_losses).mean()
    iters.append(n)
    losses.append(loss_mean)
    validation_acc = get_accuracy(model, validation_loader)
    if n % 10 == 0:
      print(f"Epoch {n:>3}, loss {loss_mean:5.3f} -> validation_acc: {validation_acc:6.4f}")
    train_acc.append(get_accuracy(model, train_loader)) # compute training accuracy
    val_acc.append(validation_acc)  # compute validation accuracy
    if validation_acc > learning_threshold:
        break
print()
print("Final Validation Accuracy: {}".format(val_acc[-1]))

Epoch   0, loss -7009.348 -> validation_acc: 0.0000
Epoch  10, loss -7204.332 -> validation_acc: 0.0000
Epoch  20, loss -7283.968 -> validation_acc: 0.0000
Epoch  30, loss -7170.318 -> validation_acc: 0.0000
Epoch  40, loss -7273.767 -> validation_acc: 0.0000
Epoch  50, loss -7429.940 -> validation_acc: 0.0000
Epoch  60, loss -7629.101 -> validation_acc: 0.0000
Epoch  70, loss -7637.949 -> validation_acc: 0.0000
Epoch  80, loss -7700.447 -> validation_acc: 0.0000
Epoch  90, loss -8017.294 -> validation_acc: 0.0000
Epoch 100, loss -7957.667 -> validation_acc: 0.0000
Epoch 110, loss -8164.395 -> validation_acc: 0.0000
Epoch 120, loss -8095.050 -> validation_acc: 0.0000
Epoch 130, loss -8290.021 -> validation_acc: 0.0000
Epoch 140, loss -8454.882 -> validation_acc: 0.0000
Epoch 150, loss -8574.934 -> validation_acc: 0.0000
Epoch 160, loss -8728.867 -> validation_acc: 0.0000
Epoch 170, loss -8837.691 -> validation_acc: 0.0000
Epoch 180, loss -8798.468 -> validation_acc: 0.0000
Epoch 190, l