In [293]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import seaborn  as sns
import random as rd
from torch.utils.data import Dataset,DataLoader
import matplotlib.pyplot as plt
import sklift.metrics as lift_metrics
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [2]:
pd.options.mode.copy_on_write = True

In [3]:
DATA = pd.read_csv('~/all_data/uni_data/train (1).csv')

In [4]:
DATA = DATA.drop('id',axis = 1)

In [5]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 52 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   treatment_group  600000 non-null  object 
 1   X_1              600000 non-null  float64
 2   X_2              600000 non-null  float64
 3   X_3              600000 non-null  float64
 4   X_4              600000 non-null  float64
 5   X_5              600000 non-null  float64
 6   X_6              600000 non-null  float64
 7   X_7              600000 non-null  float64
 8   X_8              600000 non-null  float64
 9   X_9              600000 non-null  float64
 10  X_10             600000 non-null  float64
 11  X_11             600000 non-null  float64
 12  X_12             600000 non-null  float64
 13  X_13             600000 non-null  float64
 14  X_14             600000 non-null  float64
 15  X_15             600000 non-null  float64
 16  X_16             600000 non-null  floa

In [10]:
#stratify by conversion
X_train,X_test = train_test_split(DATA,train_size = 0.825,stratify = DATA['conversion'])

In [11]:
print(len(X_train))
print(len(X_test))

495000
105000


In [12]:
X_train['conversion'].to_numpy()

array([1, 0, 0, ..., 0, 0, 0])

In [13]:
#compute train and val weights
X_train_weights = compute_class_weight('balanced',y = X_train['conversion'],classes=np.array([0,1]))
X_train_weights = torch.from_numpy(X_train_weights).to(dtype = torch.float32)
print(X_train_weights)

X_test_weights = compute_class_weight('balanced',y = X_test['conversion'],classes=np.array([0,1]))
X_test_weights = torch.from_numpy(X_test_weights).to(dtype = torch.float32)
print(X_test_weights)

tensor([0.6283, 2.4487])
tensor([0.6283, 2.4487])


In [14]:
X_train,X_val = train_test_split(X_train,train_size = 0.9,stratify = X_train['conversion'])
print(len(X_train),len(X_val))

445500 49500


In [15]:
#just to be sure
X_val_weights = compute_class_weight('balanced',y = X_val['conversion'],classes=np.array([0,1]))
X_val_weights = torch.from_numpy(X_val_weights).to(dtype = torch.float32)
print(X_val_weights)

tensor([0.6283, 2.4488])


In [16]:
print(len(X_test))

105000


In [17]:
class UpliftDataset(Dataset):
    def __init__(self,dataset_itself):
        self.data = dataset_itself
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,indx):
        df_row = self.data.iloc[indx,:]
        treatment = (df_row.iloc[0] == 'treatment')
        df_row.iloc[0] = 1 if treatment else 0
        X,y = df_row.iloc[:-1],df_row.iloc[-1]
        X,y = torch.from_numpy(X.to_numpy(dtype = np.float32)),torch.tensor(int(y))
        return X,y

In [None]:
# X_train_dataset = UpliftDataset(X_train)
# X_test_dataset = UpliftDataset(X_test)
# X_val_dataset = UpliftDataset(X_val)

In [None]:
# torch.save(X_train_dataset,'/home/luchian/prog/uni_prog/uni_data/X_train_uplift.pth')
# torch.save(X_test_dataset,'/home/luchian/prog/uni_prog/uni_data/X_test_uplift.pth')
# torch.save(X_val_dataset,'/home/luchian/prog/uni_prog/uni_data/X_val_uplift.pth')

In [9]:
X_train_dataset = torch.load('/home/luchian/prog/uni_prog/uni_data/X_train_uplift.pth')
X_test_dataset = torch.load('/home/luchian/prog/uni_prog/uni_data/X_test_uplift.pth')
X_val_dataset = torch.load('/home/luchian/prog/uni_prog/uni_data/X_val_uplift.pth')

  X_train_dataset = torch.load('/home/luchian/prog/uni_prog/uni_data/X_train_uplift.pth')
  X_test_dataset = torch.load('/home/luchian/prog/uni_prog/uni_data/X_test_uplift.pth')
  X_val_dataset = torch.load('/home/luchian/prog/uni_prog/uni_data/X_val_uplift.pth')


In [18]:
print(len(X_train_dataset) + len(X_test_dataset) + len(X_val_dataset))

600000


In [250]:
#the S model
class SModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.rel = nn.LeakyReLU(0.05)
        self.sigm = nn.Sigmoid()
        self.drop = nn.Dropout1d(p = 0.15)

        self.norm1 = nn.BatchNorm1d(128)
        self.norm2 = nn.BatchNorm1d(128)
        self.norm3 = nn.BatchNorm1d(128)
        self.norm4 = nn.BatchNorm1d(1)

        self.lin1 = nn.Linear(51,128)
        self.lin2 = nn.Linear(128,128)
        self.lin3 = nn.Linear(128,128)
        self.lin4 = nn.Linear(128,1)
    
    def forward(self,X):
        y = self.rel(self.lin1(X))
        y = self.norm1(y)
        y = self.drop(y)

        y = self.rel(self.lin2(y))
        y = self.norm2(y)
        y = self.drop(y)

        y = self.rel(self.lin3(y))
        y = self.norm3(y)
        y = self.drop(y)
     
        y = self.lin4(y)
        y = self.norm4(y)
        y = self.sigm(y)
        return y

In [251]:
def get_accuracy(model,some_dataset,dev = 'cuda'):
    model.eval()
    N = len(some_dataset)
    main_list = []
    main_loader = DataLoader(dataset = some_dataset,shuffle = True,batch_size=1)
    for X,y in main_loader:
        X,y = X.to(device = dev),y.to(device = dev)
        y_pred = model(X)
        label = 1 if y_pred > 0.5 else 0
        if label == y[0]:
            main_list.append(1)
    return round(sum(main_list)/N,5)

In [252]:
@torch.no_grad()
def dataset_loss(model,dev,dataset,criterion,target_weights,bat_size):
    """calculates the loss on the dataset given the model and loss"""
    model.eval()

    val_loss_over_batches = []

    main_loader = DataLoader(dataset = dataset,shuffle = True,batch_size = bat_size)
    for X,y in main_loader:
        X,y = X.to(device = dev),y.reshape(y.shape[0],1).to(device = dev,dtype = torch.float32)
        #some computation for weights
        yy = torch.clone(y)
        yy[yy == 0] = target_weights[0]
        yy[yy == 1] = target_weights[1]
        main_weights = yy.reshape(-1)
        #creating the loss function
        loss = criterion(reduction = 'sum',weight = main_weights)
        #calculating the loss
        y_pred = model(X)
        the_loss = loss(y_pred.T,y.T)
        val_loss_over_batches.append(round(the_loss.item(),5))
    return sum(val_loss_over_batches)/len(main_loader)


In [502]:
def train_model(
        model,dev,Train_dataset,Val_dataset,optim,optim_params,criterion,target_weights,bat_size,epoch,pre_optim = False
):
    """Train and model given the parameters"""
    model.train()

    train_loader = DataLoader(dataset = Train_dataset,shuffle=True,batch_size = bat_size)

    optimizer = optim(model.parameters(),**optim_params)
    #if the optimizer is pre-dertermined
    if pre_optim:
        optim(model.parameters(),**optim_params).load_state_dict(pre_optim.state_dict())

    train_losses_over_batches = []
    val_accs = [-float('inf')]

    for ep in range(epoch):
        model.train()
        for X,y in train_loader:
            optimizer.zero_grad()
            X,y = X.to(device = dev),y.reshape(y.shape[0],1).to(device = dev,dtype = torch.float32)
            #some computation for weights
            yy = torch.clone(y)
            yy[yy == 0] = target_weights[0]
            yy[yy == 1] = target_weights[1]
            main_weights = yy.reshape(-1)
            #for each such weight we create separte loss function
            loss = criterion(reduction = 'sum',weight = main_weights) 
            #prediction and backprop
            y_pred = model(X)
            the_loss = loss(y_pred.T,y.T)
            the_loss.backward()
            optimizer.step()
            #saving the losses 
            train_losses_over_batches.append(round(the_loss.item(),5))
        #every 3-rd epoch pring results 
        #stop iteration if the error on val score has increased
        current_val_acc = get_accuracy(model,Val_dataset,dev = 'cuda')
        val_accs.append(current_val_acc)
        print(f'Epoch #{ep+1} | Train Loss: {train_losses_over_batches[-1]} | Val accuracy: {val_accs[-1]}')
        if val_accs[-1] < val_accs[-2]:
            print('\n\nThe accuracy on Val dataset has decreased. Stopping training iterations....')
            break
    return train_losses_over_batches,val_accs,optimizer

In [498]:
smodel = SModel().to(device = 'cuda')

In [None]:
res1 = train_model(model = smodel,
                   dev = 'cuda',
                   Train_dataset = X_train_dataset,Val_dataset = X_val_dataset,
                   optim = torch.optim.Adam,
                   optim_params={'betas': (0.9,0.98),'lr': 0.08},
                   criterion=nn.BCELoss,
                   target_weights = X_train_weights,
                   bat_size = 64,
                   epoch = 3000
                   )

Epoch #1 | Train Loss: 43.67865 | Val accuracy: 0.83285
Epoch #2 | Train Loss: 40.45711 | Val accuracy: 0.83293
Epoch #3 | Train Loss: 42.93871 | Val accuracy: 0.8381
Epoch #4 | Train Loss: 40.36104 | Val accuracy: 0.82469


The accuracy on Val dataset has decreased. Stopping training iterations....


In [509]:
res2 = train_model(model = smodel,
                   dev = 'cuda',
                   Train_dataset = X_train_dataset,Val_dataset = X_val_dataset,
                   optim = torch.optim.Adam,
                   optim_params={'betas': (0.9,0.98),'lr': 0.005},
                   criterion=nn.BCELoss,
                   target_weights = X_train_weights,
                   bat_size = 64,
                   epoch = 3000,
                   pre_optim=False
                   )

Epoch #1 | Train Loss: 26.67966 | Val accuracy: 0.83545
Epoch #2 | Train Loss: 39.51958 | Val accuracy: 0.83273


The accuracy on Val dataset has decreased. Stopping training iterations....


In [None]:
#MODEL1
# torch.save(smodel.state_dict(),'/home/luchian/prog/uni_prog/uni_data/MyModels/S_model_1.pth')
# smodel.load_state_dict(torch.load('/home/luchian/prog/uni_prog/uni_data/MyModels/S_model_1.pth',weights_only = True))

In [None]:
#MODEL2
# torch.save(smodel.state_dict(),'/home/luchian/prog/uni_prog/uni_data/MyModels/S_model_2.pth')
# smodel.load_state_dict(torch.load('/home/luchian/prog/uni_prog/uni_data/MyModels/S_model_2.pth',weights_only = True))

<All keys matched successfully>

In [None]:
#MODEL3
# torch.save(smodel.state_dict(),'/home/luchian/prog/uni_prog/uni_data/MyModels/S_model_3.pth')
# smodel.load_state_dict(torch.load('/home/luchian/prog/uni_prog/uni_data/MyModels/S_model_3.pth',weights_only = True))

In [514]:
def get_uplift_metrics(y_true,uplift,treatment):
    upliftk_group = lift_metrics.uplift_at_k(y_true = y_true,uplift = uplift,treatment = treatment,strategy = 'by_group',k = 0.35)
    upliftk_overall = lift_metrics.uplift_at_k(y_true = y_true,uplift = uplift,treatment = treatment,strategy = 'overall',k = 0.35)

    qini_auc = lift_metrics.qini_auc_score(y_true = y_true,uplift = uplift,treatment = treatment)

    uplift_auc = lift_metrics.uplift_auc_score(y_true = y_true,uplift = uplift,treatment = treatment)

    #weighted average
    wau = lift_metrics.weighted_average_uplift(y_true = y_true,uplift = uplift,treatment = treatment,strategy = 'by_group')
    wau_all = lift_metrics.weighted_average_uplift(y_true = y_true,uplift = uplift,treatment = treatment,strategy = 'by_group')

    print(f'Uplift at top 30% by group: {upliftk_group:.3f} | Uplift at top 30% by overall: {upliftk_overall:.3f}')
    print(f'Weighted average uplift by group: {wau:.3f} | Weighted average uplift by overall: {wau_all:.3f}  ')
    print(f'AUUC by group: {uplift_auc:.3f}')
    print(f'AUQC by group: {qini_auc:.3f}')

    metric_dict = {'uplift@k_group':upliftk_group,'uplift@k_overall':upliftk_overall,
                   'qini_auc':qini_auc,'uplift_auc':uplift_auc,'WAU':wau,'WAU_all':wau_all}


    

In [411]:
class TestUpliftDataset(Dataset):
    def __init__(self,dataset_itself):
        self.data = dataset_itself
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,indx):
        df_row = self.data.iloc[indx,:]
        treatment = (df_row.iloc[0] == 'treatment')
        df_row.iloc[0] = 1 if treatment else 0
        X,y = df_row.iloc[:-1],df_row.iloc[-1]
        X,y = torch.from_numpy(X.to_numpy(dtype = np.float32)),torch.tensor(int(y))
        return X[1:],X[0],y.to(dtype = torch.float32) #X,treatment,y

In [412]:
dataset = TestUpliftDataset(X_test)

In [413]:
dataset[0][0].shape

torch.Size([50])

In [457]:
def get_up_t_y(model,test_dataset):
    model.eval()
    main_dataset = test_dataset
    main_dict = {'uplift':[],'treatment':[],'y_true':[]}
    for X,t,y in main_dataset:
        X = X.to(device = 'cuda')
        w1 = torch.tensor([1],dtype = torch.float32).to(device = 'cuda')
        w0 = torch.tensor([0],dtype = torch.float32).to(device = 'cuda')
        X1 = torch.cat([w1,X],axis = 0).reshape(1,X.shape[0]+1)
        X2 = torch.cat([w0,X],axis = 0).reshape(1,X.shape[0]+1)
        y_pred_1 = model(X1).cpu()
        y_pred_0 = model(X2).cpu()
        #getting_values
        uplift = (y_pred_1-y_pred_0).item()
        t = t.item()
        y = y.item()
        #inserting in dictionary
        main_dict['uplift'].append(uplift)
        main_dict['treatment'].append(t)
        main_dict['y_true'].append(y)
    return main_dict

In [510]:
s_dict = get_up_t_y(smodel,dataset)

In [515]:
get_uplift_metrics(y_true = np.array(s_dict['y_true']),
                         uplift = np.array(s_dict['uplift']),
                         treatment = np.array(s_dict['treatment']))

Uplift at top 30% by group: 0.190 | Uplift at top 30% by overall: 0.190
Weighted average uplift by group: 0.046 | Weighted average uplift by overall: 0.046  
AUUC by group: 0.149
AUQC by group: 0.215
