In [73]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import torch
import sklearn
import pytorch_lightning as pl
from torch.utils.data import random_split
from torch.optim import AdamW, Adam
from torchmetrics import AUROC
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel
from torch.autograd import Variable
from tqdm.autonotebook import trange
import sklearn.metrics as metrica
from torch.optim import lr_scheduler
import warnings
warnings.filterwarnings("ignore")

In [2]:
transactions = pd.read_csv('transactions.csv')
train = pd.read_csv('train.csv')
mcc = pd.read_csv('mcc_codes.csv', sep = ';')
trans_types = pd.read_csv('trans_types.csv', sep = ';')


In [3]:
transactions_merged = pd.merge(transactions, train, on=['client_id'])

In [4]:
transactions_merged['Day'] = transactions_merged['trans_time'].str[:-9].astype('int')
transactions_merged.drop(columns = ['Unnamed: 0'], inplace = True)
transactions_merged['Hours'] = transactions['trans_time'].str[-2:].astype('int')/3600 + transactions['trans_time'].str[-5:-3].astype('int')/60 + transactions['trans_time'].str[-8:-6].astype('int')

In [5]:

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()
def embed(row):
    return embed_bert_cls(row, model, tokenizer)


In [6]:
mcc_embed = np.array([embed(mcc['mcc_description'].values[i]) 
                      for i in range(len(mcc['mcc_description']))], dtype=np.float32)

ttp_embed = np.array([embed(mcc['mcc_description'].values[i]) 
                      for i in range(len(mcc['mcc_description']))], dtype=np.float32)

In [7]:
mcc['embed_mcc'] = mcc['mcc_description'].apply(embed)
trans_types['embed_trans_type'] = trans_types['trans_description'].apply(embed)

In [8]:
mcc

Unnamed: 0,mcc_code,mcc_description,embed_mcc
0,742,Ветеринарные услуги,"[-0.0013893048, -0.022407576, 0.020457322, -0...."
1,1711,"Генеральные подрядчики по вентиляции, теплосна...","[0.042593464, 0.03830093, -0.011149202, -0.077..."
2,1731,Подрядчики по электричеству,"[-0.068887375, 0.004222398, 0.0019334967, -0.0..."
3,1799,"Подрядчики, специализированная торговля — нигд...","[0.056087583, -0.043310516, -0.049377777, -0.1..."
4,2741,Разнообразные издательства/печатное дело,"[0.0061180755, -0.0080110645, 0.0036462303, -0..."
...,...,...,...
179,9211,"Судовые выплаты, включая алименты и детскую по...","[0.06675858, -0.034593407, -0.048400972, -0.07..."
180,9222,Штрафы,"[-0.036337346, -0.041523345, 0.013427289, -0.0..."
181,9311,Налоговые платежи,"[0.018681997, -0.04539055, -0.023520647, -0.03..."
182,9399,"Правительственные услуги, нигде ранее не класс...","[0.006963337, -0.060759354, -0.030807396, -0.1..."


In [9]:
transactions_merged.mcc_code

0          4829
1          4829
2          4814
3          6011
4          6011
           ... 
3238727    4814
3238728    6011
3238729    6011
3238730    6011
3238731    4829
Name: mcc_code, Length: 3238732, dtype: int64

In [10]:
transactions_merged = pd.merge(transactions_merged, mcc, on=['mcc_code'])
transactions_merged = pd.merge(transactions_merged, trans_types, on=['trans_type'])

In [11]:
df_explore = transactions_merged[['amount', 'Hours','embed_mcc','embed_trans_type','gender']].fillna(0)
df_explore

Unnamed: 0,amount,Hours,embed_mcc,embed_trans_type,gender
0,-1808.56,8.411389,"[-0.018846657, -0.02290359, -0.02400488, -0.02...","[-0.026591301, -0.021923874, -0.09033733, -0.0...",0
1,-3390.41,13.454167,"[-0.018846657, -0.02290359, -0.02400488, -0.02...","[-0.026591301, -0.021923874, -0.09033733, -0.0...",0
2,-6155.97,11.474167,"[-0.018846657, -0.02290359, -0.02400488, -0.02...","[-0.026591301, -0.021923874, -0.09033733, -0.0...",0
3,-1447.13,20.142778,"[-0.018846657, -0.02290359, -0.02400488, -0.02...","[-0.026591301, -0.021923874, -0.09033733, -0.0...",0
4,-724.03,9.556944,"[-0.018846657, -0.02290359, -0.02400488, -0.02...","[-0.026591301, -0.021923874, -0.09033733, -0.0...",0
...,...,...,...,...,...
3237368,48244.41,10.645000,"[-0.015917236, -0.025938924, -0.062061, -0.078...","[0.0034363645, -0.027133552, -0.07808184, -0.0...",0
3237369,18365.62,8.228056,"[-0.015917236, -0.025938924, -0.062061, -0.078...","[0.0034363645, -0.027133552, -0.07808184, -0.0...",0
3237370,11407.09,12.018056,"[-0.015917236, -0.025938924, -0.062061, -0.078...","[0.0034363645, -0.027133552, -0.07808184, -0.0...",1
3237371,10023.46,17.761389,"[-0.015917236, -0.025938924, -0.062061, -0.078...","[0.0034363645, -0.027133552, -0.07808184, -0.0...",1


In [12]:
from tqdm import tqdm
import pickle 
with open('mcc.pickle', 'rb') as handle:
    embed_mcc = pickle.load(handle)
    
with open('tt.pickle', 'rb') as handle:
    embed_tt = pickle.load(handle)

In [35]:
import pickle 
with open('mcc.pickle', 'wb+') as handle:
    pickle.dump(embed_mcc, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('tt.pickle', 'wb+') as handle:
    pickle.dump(embed_tt, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
embed_tt = np.array(embed_tt, dtype=np.float32)

In [14]:
embed_mcc.shape

(3237373, 312)

In [13]:
X = df_explore.drop('gender', axis=1)
y = df_explore['gender']


In [14]:


class Data(pl.LightningDataModule):
    def __init__(self, batch_size:int, nums:np.array, emb1, emb2, y:np.array, val_split=0.2):
        super().__init__()
        self.batch_size = batch_size
        self.num_classes = 2
        self.embed1 = torch.tensor(emb1).to(torch.float32)
        self.embed2 = torch.tensor(emb2).to(torch.float32)
        self.numeric = torch.tensor(nums).to(torch.float32)
        self.y = torch.tensor(y).to(torch.float32)
        self.val_split = val_split
        
    
    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            generator = torch.Generator().manual_seed(42)
            dataset = TensorDataset(self.numeric, self.embed1, self.embed2, self.y)
            self.train, self.val = random_split(dataset, 
                    [int(len(dataset)*(1 - self.val_split)), int(len(dataset)*self.val_split)], generator=generator)
    
    def train_dataloader(self):
        return DataLoader(self.train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val, batch_size=self.batch_size)


In [15]:
data = Data(128, X[['amount', 'Hours']].values[:2000000], embed_mcc[:2000000], embed_tt[:2000000], y.values[:2000000])
data.setup(stage='fit')

In [18]:
from torch.optim import AdamW, Adam
from torchmetrics import AUROC

class LTTwoEncoders(pl.LightningModule):
    def __init__(self, lr=1e-5, loss=nn.BCEWithLogitsLoss()):
        super().__init__()
        self.lr = lr
        self.loss = loss
        self.save_hyperparameters()
        self.auroc = AUROC(task='binary')
        
        self.num_mlp = nn.Sequential(
            nn.Linear(2, 128),
            nn.ReLU()
        )
        self.mcc_mlp = nn.Sequential(
            nn.Linear(312, 128),
            nn.ReLU()
        )
        self.tt_mlp = nn.Sequential(
            nn.Linear(312, 128),
            nn.ReLU()
        )
        
        self.mlp = nn.Sequential(
            nn.Linear(128 * 3, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, num, mcc, tt):
        num = self.num_mlp(num)
        mcc = self.mcc_mlp(mcc)
        tt = self.tt_mlp(tt)
        inp = torch.cat([num, mcc, tt], 1)
        outp = self.mlp(inp)
        return outp
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr = self.lr)
    
    def training_step(self, train_batch, batch_idx):
        num, mcc, tt, y = train_batch
        logits = self.forward(num, mcc, tt)
        loss = self.loss(logits, y.view(-1, 1))
        self.log("train_loss", loss)
        y1 = y.clone().to(torch.int32).view(-1, 1)
        self.log("train_auroc", self.auroc(logits, y1))
        return loss
    
    def validation_step(self, valid_batch, batch_idx):
        num, mcc, tt, y = valid_batch
        logits = self.forward(num, mcc, tt)
        loss = self.loss(logits, y.view(-1, 1))
        self.log("val_loss", loss)
        y1 = y.clone().to(torch.int32).view(-1, 1)
        self.log("val_auroc", self.auroc(logits, y1))
        
torch.set_float32_matmul_precision('high')

In [None]:
clf = LTTwoEncoders(lr=1e-5)
trainer = pl.Trainer(default_root_dir = '.', accelerator="auto", devices="auto", max_epochs=5)
trainer.fit(clf, data)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | loss    | BCEWithLogitsLoss | 0     
1 | auroc   | BinaryAUROC       | 0     
2 | num_mlp | Sequential        | 384   
3 | mcc_mlp | Sequential        | 40.1 K
4 | tt_mlp  | Sequential        | 40.1 K
5 | mlp     | Sequential        | 115 K 
----------------------------------------------
195 K     Trainable params
0         Non-trainable params
195 K     Total params
0.782     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
bin_auroc = AUROC(task='binary')
mets = []
for num, mcc, tt, y in tqdm(data.val_dataloader()):
    preds = clf(num, mcc, tt)
    met = bin_auroc(preds, y)
    mets.append(met.cpu().detach().numpy())

In [33]:
np.array(mets).mean()

0.60072273

In [128]:
mcc_freq = pd.read_csv("alt_data/mcc_frequency.csv")
ttp_freq = pd.read_csv("alt_data/trans_types_frequency.csv")
incomes  = pd.read_csv("alt_data/Incomes.csv")
outcomes = pd.read_csv("alt_data/Outcomes.csv")
genders  = pd.read_csv("alt_data/genders.csv")
inc_hour = pd.read_csv("alt_data/Incomes_at_hour.csv")
out_hour = pd.read_csv("alt_data/outcomes_at_hours.csv")
cities   = pd.read_csv("alt_data/Citites distribution.csv")

In [129]:
cities = cities.set_index("Unnamed: 0")
cities

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
c5f18a1d1d83fda62e51e6ae1d7b8911,0,0,0,0,0,0,0,1,0,0
4b6861803556de90249e13d7cf66f0a9,0,0,0,0,0,0,1,0,0,0
7d3f908e1cd1e3624a1445df5e04c4fa,0,0,0,0,0,0,1,0,0,0
40c585a809c1bd17dc6d283d61acd2d2,0,0,0,0,0,0,1,0,0,0
5bd2ff9f6c591353577488b0dfca6dea,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
596baadc284cb94a91c1fdd7a422a022,0,0,0,0,0,0,0,0,0,1
9b317a4cff6bb3b630cd966f68db410b,0,0,0,0,0,0,0,0,1,0
eb3b544e3c5293739f09dcd71a649b18,0,0,1,0,0,0,0,0,0,0
e1c1a70bfcb000d1188745cef71b3b61,0,0,0,0,0,1,0,0,0,0


In [130]:
mcc_freq = mcc_freq.drop("Unnamed: 0", axis=1)
mcc_freq.sum().values

array([630, 605, 386, ..., 406, 142, 168], dtype=int64)

In [131]:
mcc_freq /= mcc_freq.sum().values

In [132]:
mcc_freq = mcc_freq.transpose()
mcc_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,174,175,176,177,178,179,180,181,182,183
c5f18a1d1d83fda62e51e6ae1d7b8911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.001587,0.0,0.0,0.0,0.000000,0.0
4b6861803556de90249e13d7cf66f0a9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
7d3f908e1cd1e3624a1445df5e04c4fa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.005181,0.0,0.0,0.0,0.000000,0.0
40c585a809c1bd17dc6d283d61acd2d2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
5bd2ff9f6c591353577488b0dfca6dea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.028736,0.0,0.0,0.0,0.002874,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596baadc284cb94a91c1fdd7a422a022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002169,0.0,0.0,0.0,0.004338,0.0,0.0,0.0,0.000000,0.0
9b317a4cff6bb3b630cd966f68db410b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
eb3b544e3c5293739f09dcd71a649b18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
e1c1a70bfcb000d1188745cef71b3b61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.021127,0.0,0.0,0.0,0.000000,0.0


In [133]:
def preprocess(data):
    eps = 1e-5
    data = data.set_index("Unnamed: 0").T
    scale = np.abs(data.sum().values)
    data = (data / (scale + eps)).T
    return data

In [134]:
ttp_freq  = preprocess(ttp_freq)
incomes   = preprocess(incomes )
outcomes  = preprocess(outcomes)
incs_hour = preprocess(inc_hour)
outs_hour = preprocess(out_hour)

In [136]:
incs_hour

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c5f18a1d1d83fda62e51e6ae1d7b8911,0.025599,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029013,0.061041,...,0.014849,0.046398,0.171734,0.039252,0.033067,0.164689,0.055041,0.057605,0.055465,0.021334
4b6861803556de90249e13d7cf66f0a9,0.005294,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.139680,0.000000,...,0.000000,0.029409,0.198930,0.128387,0.078408,0.103919,0.001377,0.049004,0.000000,0.000000
7d3f908e1cd1e3624a1445df5e04c4fa,0.000000,0.000000,0.000000,0.000000,0.000000,0.018742,0.018757,0.000000,0.085752,0.075921,...,0.056222,0.010308,0.014527,0.004689,0.107790,0.023439,0.000000,0.028106,0.000000,0.000000
40c585a809c1bd17dc6d283d61acd2d2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.024167,...,0.000000,0.000000,0.010013,0.048348,0.000000,0.338087,0.000000,0.193100,0.000000,0.000000
5bd2ff9f6c591353577488b0dfca6dea,0.005249,0.000392,0.000000,0.000000,0.000000,0.000000,0.011934,0.000533,0.012168,0.089142,...,0.006825,0.006014,0.001700,0.054325,0.005241,0.097697,0.011804,0.034935,0.000596,0.000086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596baadc284cb94a91c1fdd7a422a022,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.036622,0.122798,...,0.190776,0.107238,0.051409,0.047996,0.031114,0.006618,0.000000,0.000000,0.000000,0.000000
9b317a4cff6bb3b630cd966f68db410b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.139806,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.016254,0.154557,0.000000,0.000000,0.000000
eb3b544e3c5293739f09dcd71a649b18,0.000000,0.000000,0.005852,0.274559,0.087678,0.158537,0.051125,0.007302,0.189959,0.008743,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
e1c1a70bfcb000d1188745cef71b3b61,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028005,0.000000,...,0.115298,0.028017,0.028001,0.048407,0.000000,0.067220,0.543856,0.082608,0.000000,0.000000


In [139]:
print(mcc_freq.shape)
print(ttp_freq.shape)
print( incomes.shape)
print(outcomes.shape)
print(incs_hour.shape)
print(outs_hour.shape)
print(cities.shape)

(7560, 184)
(7560, 155)
(7560, 184)
(7560, 184)
(7560, 24)
(7560, 24)
(7560, 10)


In [158]:
class AltData(pl.LightningDataModule):
    def __init__(self, batch_size:int, mcc, ttp, inc, out, inc_h, out_h, cities, y:np.array, val_split=0.2):
        super().__init__()
        self.batch_size = batch_size
        self.num_classes = 2
        self.mcc = torch.tensor(mcc).to(torch.float32)
        self.ttp = torch.tensor(ttp).to(torch.float32)
        self.inc = torch.tensor(inc).to(torch.float32)
        self.out = torch.tensor(out).to(torch.float32)
        self.inc_h = torch.tensor(inc_h).to(torch.float32)
        self.out_h = torch.tensor(out_h).to(torch.float32)
        self.cities = torch.tensor(cities).to(torch.float32)
        self.features = [self.mcc, self.ttp, self.inc, self.out, self.inc_h, self.out_h, self.cities]
        
        self.y = torch.tensor(y).to(torch.float32)
        self.val_split = val_split
        
    
    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            generator = torch.Generator().manual_seed(42)
            dataset = TensorDataset(*self.features, self.y)
            self.train, self.val = random_split(dataset, 
                    [int(len(dataset)*(1 - self.val_split)), int(len(dataset)*self.val_split)], generator=generator)
    
    def train_dataloader(self):
        return DataLoader(self.train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val, batch_size=self.batch_size)


In [159]:
alt_data = AltData(512, mcc_freq.values, ttp_freq.values, 
                   incomes.values, outcomes.values, incs_hour.values, outs_hour.values, 
                   cities.values, genders.values[:, 1].astype(np.float32))

In [240]:
from torch.optim import AdamW, Adam
from torchmetrics import AUROC
from torch.optim.lr_scheduler import ReduceLROnPlateau

class AltModel(pl.LightningModule):
    def __init__(self, lr=1e-5, loss=nn.BCEWithLogitsLoss()):
        super().__init__()
        self.lr = lr
        self.loss = loss
        self.save_hyperparameters()
        self.auroc = AUROC(task='binary')
        
        self.inc_mlp = nn.Sequential(
            nn.Linear(184, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
        )
        
        self.out_mlp = nn.Sequential(
            nn.Linear(184, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )
        
        self.inch_mlp = nn.Sequential(
            nn.Linear(24, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32)
        )
        
        self.outh_mlp = nn.Sequential(
            nn.Linear(24, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32)
        )
        
        self.mcc_mlp = nn.Sequential(
            nn.Linear(184, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )
        self.tt_mlp = nn.Sequential(
            nn.Linear(155, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )
        
        self.city_mlp = nn.Sequential(
            nn.Linear(10, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16)
        )
        
        
        self.mlp = nn.Sequential(
            nn.Linear(64 * 4 + 32 * 2 + 16 * 1, 128),
            nn.Dropout(p=0.3), 
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2), 
            nn.Linear(64, 1)
        )
        
    def forward(self, mcc, ttp, inc, out, inc_h, out_h, cities):
        mcc = self.mcc_mlp(mcc)
        ttp = self.tt_mlp(ttp)
        inc = self.inc_mlp(inc)
        out = self.out_mlp(out)
        inc_h = self.inch_mlp(inc_h)
        out_h = self.outh_mlp(out_h)
        cities = self.city_mlp(cities)
        inp = torch.cat([mcc, ttp, inc, out, inc_h, out_h, cities], 1)
        outp = self.mlp(inp)
        return outp
    
    
    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr = self.lr)
        scheduler = ReduceLROnPlateau(optimizer, factor=0.2, cooldown=1)
        return  {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

    
    def training_step(self, train_batch, batch_idx):
        mcc, ttp, inc, out, inc_h, out_h, cities, y = train_batch
        logits = self.forward(mcc, ttp, inc, out, inc_h, out_h, cities)
        loss = self.loss(logits, y.view(-1, 1))
        self.log("train_loss", loss)
        y1 = y.clone().to(torch.int32).view(-1, 1)
        self.log("train_auroc", self.auroc(logits, y1))
        return loss
    
    
    def validation_step(self, valid_batch, batch_idx):
        mcc, ttp, inc, out, inc_h, out_h, cities, y = valid_batch
        logits = self.forward(mcc, ttp, inc, out, inc_h, out_h, cities)
        loss = self.loss(logits, y.view(-1, 1))
        self.log("val_loss", loss)
        y1 = y.clone().to(torch.int32).view(-1, 1)
        self.log("val_auroc", self.auroc(logits, y1))
        

In [243]:
clf = AltModel(lr=5e-3)
trainer = pl.Trainer(default_root_dir = './alter', accelerator="auto", devices="auto", max_epochs=200)
trainer.fit(clf, alt_data)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type              | Params
-----------------------------------------------
0 | loss     | BCEWithLogitsLoss | 0     
1 | auroc    | BinaryAUROC       | 0     
2 | inc_mlp  | Sequential        | 12.0 K
3 | out_mlp  | Sequential        | 12.0 K
4 | inch_mlp | Sequential        | 864   
5 | outh_mlp | Sequential        | 864   
6 | mcc_mlp  | Sequential        | 12.0 K
7 | tt_mlp   | Sequential        | 10.1 K
8 | city_mlp | Sequential        | 208   
9 | mlp      | Sequential        | 51.5 K
-----------------------------------------------
99.4 K    Trainable params
0         Non-trainable params
99.4 K    Total params
0.398     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=200` reached.


In [222]:
trained_model = AltModel.load_from_checkpoint("./alter/lightning_logs/version_23/checkpoints/epoch=199-step=2400.ckpt")

RuntimeError: Error(s) in loading state_dict for AltModel:
	Missing key(s) in state_dict: "inc_mlp.2.weight", "inc_mlp.2.bias", "inc_mlp.2.running_mean", "inc_mlp.2.running_var", "out_mlp.2.weight", "out_mlp.2.bias", "out_mlp.2.running_mean", "out_mlp.2.running_var", "inch_mlp.2.weight", "inch_mlp.2.bias", "inch_mlp.2.running_mean", "inch_mlp.2.running_var", "outh_mlp.2.weight", "outh_mlp.2.bias", "outh_mlp.2.running_mean", "outh_mlp.2.running_var", "mcc_mlp.2.weight", "mcc_mlp.2.bias", "mcc_mlp.2.running_mean", "mcc_mlp.2.running_var", "tt_mlp.2.weight", "tt_mlp.2.bias", "tt_mlp.2.running_mean", "tt_mlp.2.running_var", "city_mlp.2.weight", "city_mlp.2.bias", "city_mlp.2.running_mean", "city_mlp.2.running_var". 
	size mismatch for inc_mlp.0.weight: copying a param with shape torch.Size([32, 184]) from checkpoint, the shape in current model is torch.Size([64, 184]).
	size mismatch for inc_mlp.0.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for out_mlp.0.weight: copying a param with shape torch.Size([32, 184]) from checkpoint, the shape in current model is torch.Size([64, 184]).
	size mismatch for out_mlp.0.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for mcc_mlp.0.weight: copying a param with shape torch.Size([32, 184]) from checkpoint, the shape in current model is torch.Size([64, 184]).
	size mismatch for mcc_mlp.0.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for tt_mlp.0.weight: copying a param with shape torch.Size([32, 155]) from checkpoint, the shape in current model is torch.Size([64, 155]).
	size mismatch for tt_mlp.0.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for mlp.0.weight: copying a param with shape torch.Size([128, 208]) from checkpoint, the shape in current model is torch.Size([128, 336]).

In [66]:
class AltModel2(pl.LightningModule):
    def __init__(self, lr=1e-5, loss=nn.BCEWithLogitsLoss()):
        super().__init__()
        self.lr = lr
        self.loss = loss
        self.save_hyperparameters()
        self.auroc = AUROC(task='binary')
        
        self.inc_mlp = nn.Sequential(
            nn.Linear(184, 32),
            nn.ReLU()
        )
        
        self.out_mlp = nn.Sequential(
            nn.Linear(184, 32),
            nn.ReLU()
        )
        
        self.inch_mlp = nn.Sequential(
            nn.Linear(24, 16),
            nn.ReLU()
        )
        
        self.outh_mlp = nn.Sequential(
            nn.Linear(24, 16),
            nn.ReLU()
        )
        
        self.mcc_mlp = nn.Sequential(
            nn.Linear(184, 32),
            nn.ReLU()
        )
        self.tt_mlp = nn.Sequential(
            nn.Linear(155, 32),
            nn.ReLU()
        )
        
        self.city_mlp = nn.Sequential(
            nn.Linear(10, 16),
            nn.ReLU()
        )
        
        
        self.mlp = nn.Sequential(
            nn.Linear(32 * 4 + 16 * 3, 256),
            nn.Dropout(p=0.5), 
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.Dropout(p=0.3), 
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2), 
            nn.Linear(64, 1)
        )
        
    def forward(self, mcc, ttp, inc, out, inc_h, out_h, cities):
        
        mcc = self.mcc_mlp(mcc)
        ttp = self.tt_mlp(ttp)
        inc = self.inc_mlp(inc)
        out = self.out_mlp(out)
        inc_h = self.inch_mlp(inc_h)
        out_h = self.outh_mlp(out_h)
        cities = self.city_mlp(cities)
        inp = torch.cat([mcc, ttp, inc, out, inc_h, out_h, cities], 1)
        outp = self.mlp(inp)
        return outp
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr = self.lr)
    
    def training_step(self, train_batch, batch_idx):
        mcc, ttp, inc, out, inc_h, out_h, cities, y = train_batch
        logits = self.forward(mcc, ttp, inc, out, inc_h, out_h, cities)
        loss = self.loss(logits, y.view(-1, 1))
        self.log("train_loss", loss)
        y1 = y.clone().to(torch.int32).view(-1, 1)
        self.log("train_auroc", self.auroc(logits, y1))
        return loss
    
    def validation_step(self, valid_batch, batch_idx):
        mcc, ttp, inc, out, inc_h, out_h, cities, y = valid_batch
        logits = self.forward(mcc, ttp, inc, out, inc_h, out_h, cities)
        loss = self.loss(logits, y.view(-1, 1))
        self.log("val_loss", loss)
        y1 = y.clone().to(torch.int32).view(-1, 1)
        self.log("val_auroc", self.auroc(logits, y1))

In [None]:
embed_mcc_cols = df_explore['embed_mcc'].apply(pd.Series)
y = df_explore['gender']


In [None]:
embed_trans_type_cols = df_explore['embed_trans_type'].apply(pd.Series)
X = pd.concat([embed_mcc_cols, embed_trans_type_cols, df_explore[['amount', 'Hours']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
embed_mcc_cols

In [20]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [None]:
xgb_config = {'use_gpu': True}
model = xgb.XGBClassifier(**xgb_config)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
auc_score = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", auc_score)

In [22]:
import gc
gc.collect()

818

In [2]:
torch.cuda.is_available()

True