In [24]:
import os

from imblearn.over_sampling import RandomOverSampler
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import pandas as pd
from datetime import datetime, timedelta
from torch.utils.data import DataLoader, random_split, TensorDataset
from torchvision.datasets import MNIST

device = "cuda" if torch.cuda.is_available() else "cpu"

In [25]:
def most_similar(pred, customers, n):
    n_cust = customers.shape[0]
    cust_sims = np.empty(n_cust)
    
    for i in range(n_cust):
        cosine_sim = dot(pred, customers.values[i])/(norm(pred)*norm(customers.values[i]))
        cust_sims[i] = cosine_sim
        
        
    return cust_sims

In [26]:
class Generator(nn.Module):
    def __init__(self, product_dims, customer_dims):
        super().__init__()
        
        leakyReLU_slope = 0.1
        dropout = 0.1

        self.model = nn.Sequential(
            nn.Linear(product_dims, 64),
            nn.LeakyReLU(leakyReLU_slope),
            nn.Dropout(dropout),
            nn.Linear(64, 64),
            nn.LeakyReLU(leakyReLU_slope),
            nn.Dropout(dropout),
            nn.Linear(64, 64),
            nn.LeakyReLU(leakyReLU_slope),
            nn.Linear(64, customer_dims),
            nn.Sigmoid()
        )

    def forward(self, product_vector):
        return self.model(product_vector)

In [27]:
class Discriminator(nn.Module):
    def __init__(self, customer_dims):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(customer_dims, 64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(64, 64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(64, 32),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(32, 32),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        )

    def forward(self, customer_vector):
        return self.model(customer_vector)

In [6]:
# Import data
transactions = pd.read_csv('data/transactions_1.csv')[['PRODUCT_ID', 'CUSTOMER_ID']]
products = pd.read_csv('data/products.csv')
customers = pd.read_csv('data/customers.csv')

In [7]:
# Products data
prods_final = pd.concat([
    pd.get_dummies(products['CATEGORY'])
], axis=1)
prods_final['price'] = products['PRICE']/products['PRICE'].max()
prods_final['PRODUCT_ID'] = products['ID'].copy()

In [16]:
customers_final = pd.concat([
    pd.get_dummies(customers['GENDER'])[['Female', 'Male']],
    pd.get_dummies(customers['COUNTRY']),
    pd.get_dummies(customers['TYPE'])
], axis=1)
customers_final['age'] = ((datetime.today() - pd.to_datetime(customers['DOB']))/timedelta(days=365)).round()/100
customers_final['CUSTOMER_ID'] = customers['ID'].copy()

In [9]:
customers_final = transactions.merge(customers_final, how='left', on='CUSTOMER_ID') \
    .drop(columns=['CUSTOMER_ID']) \
    .groupby('PRODUCT_ID') \
    .mean() \
    .reset_index(drop=False)

merged = prods_final.merge(customers_final, on='PRODUCT_ID')
X = merged.iloc[:,0:len(prods_final.columns)].drop(columns='PRODUCT_ID').reset_index(drop=True)
y = merged.iloc[:,len(prods_final.columns):len(merged.columns)].reset_index(drop=True)

In [11]:
prods_final = transactions[0:100000].merge(prods_final, how='left', on='PRODUCT_ID') \
    .drop(columns=['PRODUCT_ID']) \
    .groupby('CUSTOMER_ID') \
    .mean() \
    .reset_index(drop=False)

In [19]:
merged = customers_final.merge(prods_final, on='CUSTOMER_ID')
X = merged.iloc[:,0:len(customers_final.columns)].drop(columns='CUSTOMER_ID').reset_index(drop=True)
y = merged.iloc[:,len(customers_final.columns):len(merged.columns)].reset_index(drop=True)

In [33]:
X_nans_mask = X.isna().all(axis=1)
X = X[~X_nans_mask]
y = y[~X_nans_mask]

y_nans_mask = y.isna().all(axis=1)
X = X[~y_nans_mask].reset_index(drop=True)
y = y[~y_nans_mask].reset_index(drop=True)

In [None]:
# Training data
# X = transactions.merge(prods_final, how='left', on='PRODUCT_ID').drop(columns=['PRODUCT_ID', 'CUSTOMER_ID'])
# y = transactions.merge(customers_final, how='left', on='CUSTOMER_ID').drop(columns=['PRODUCT_ID', 'CUSTOMER_ID'])

In [None]:
"""
X_nans_mask = X.isna().all(axis=1)
X = X[~X_nans_mask]
y = y[~X_nans_mask]

y_nans_mask = y.isna().all(axis=1)
X = X[~y_nans_mask].reset_index(drop=True)
y = y[~y_nans_mask].reset_index(drop=True)
"""

In [38]:
train_size = 1
batch_size = 64
lr = 0.00001
num_epochs = 100
product_dims = len(X.columns)
customer_dims = len(y.columns)
disc_input_dims = product_dims + customer_dims
#fixed_noise = torch.randn((batch_size, product_dims)).to(device)
#fixed_noise = input_data[200:205]
step = 0

disc = Discriminator(disc_input_dims).to(device)
gen = Generator(product_dims, customer_dims).to(device)
criterion = nn.BCELoss()
opt_disc = optim.Adam(disc.parameters(), lr=lr)
opt_gen = optim.Adam(gen.parameters(), lr=lr)

train_inds = random.sample(list(X.index.values), int(train_size*len(X.index)))
input_data = torch.tensor(X.iloc[train_inds].values.astype(np.float32))
true_positive = torch.tensor(y.iloc[train_inds].values.astype(np.float32))
labels = torch.ones_like(input_data)
train_dataset = TensorDataset(
    input_data,
    true_positive
)
loader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)

In [39]:
torch.set_printoptions(precision=3, sci_mode=False, linewidth=180, profile='full')

n_total_steps = len(loader)
for epoch in range(num_epochs):
    for batch_idx, (input_prod, true_positive) in enumerate(loader):
        input_prod = input_prod.to(device)
        batch_size = input_prod.shape[0]
        
        # Train Discriminator: max log(D(real)) + log(1 - D(G(z)))
        #noise = torch.randn(batch_size, product_dims)
        disc_input_tp = torch.cat((true_positive, input_prod), 1)
        disc_tp = disc(disc_input_tp).view(-1)
        lossD_tp = criterion(disc_tp, torch.ones_like(disc_tp))
        
        fake = gen(input_prod)
        disc_input_fake = torch.cat((fake, input_prod), 1)
        disc_fake = disc(disc_input_fake).view(-1)
        lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        lossD = (lossD_tp + lossD_fake)/2
        opt_disc.zero_grad()
        lossD.backward(retain_graph=True)
        opt_disc.step()
        
        # Train Generator min log(1 - D(G(z))) -> max log(D(G(z)))
        output = disc(disc_input_fake).view(-1)
        lossG = criterion(output, torch.ones_like(output))

        opt_gen.zero_grad()
        lossG.backward()
        opt_gen.step()
        
        if batch_idx == 0:# or ((batch_idx + 1) % 50 == 0):
            print(
                f"Epoch [{epoch}/{num_epochs}] \ "
                f"Step [{batch_idx + 1}/{n_total_steps}] \ "
                f"Loss D: {lossD:.3f}, Loss G: {lossG:.3f}"
            )
            
            if epoch % 5 == 0:
                with torch.no_grad():
                    
                    random_inds = random.sample(list(range(len(input_data))), 5)
                    sample_inputs = input_data[random_inds]
                    fake = gen(sample_inputs)
                    print(fake[0:5])

Epoch [0/100] \ Step [1/418] \ Loss D: 0.696, Loss G: 0.736
tensor([[0.488, 0.499, 0.544, 0.473, 0.493, 0.492, 0.537, 0.520, 0.496, 0.527, 0.470, 0.493, 0.464, 0.505, 0.539, 0.470, 0.524, 0.502, 0.522, 0.486, 0.476],
        [0.489, 0.496, 0.538, 0.476, 0.490, 0.493, 0.532, 0.516, 0.498, 0.522, 0.472, 0.493, 0.463, 0.504, 0.537, 0.475, 0.525, 0.492, 0.521, 0.484, 0.480],
        [0.490, 0.502, 0.543, 0.473, 0.494, 0.491, 0.535, 0.526, 0.488, 0.524, 0.468, 0.497, 0.467, 0.504, 0.538, 0.473, 0.521, 0.501, 0.519, 0.488, 0.477],
        [0.487, 0.499, 0.542, 0.473, 0.497, 0.486, 0.531, 0.522, 0.495, 0.521, 0.473, 0.492, 0.467, 0.502, 0.532, 0.466, 0.524, 0.499, 0.515, 0.477, 0.473],
        [0.490, 0.508, 0.545, 0.472, 0.500, 0.491, 0.530, 0.531, 0.491, 0.526, 0.471, 0.485, 0.471, 0.503, 0.537, 0.472, 0.521, 0.505, 0.512, 0.484, 0.470]])
Epoch [1/100] \ Step [1/418] \ Loss D: 0.683, Loss G: 0.746
Epoch [2/100] \ Step [1/418] \ Loss D: 0.636, Loss G: 0.820
Epoch [3/100] \ Step [1/418] \ Los

KeyboardInterrupt: 

In [40]:
with torch.no_grad():
    preds = gen(input_data).numpy()

In [41]:
preds

array([[1.13820108e-02, 1.44382866e-05, 9.90198896e-05, ...,
        8.21392860e-06, 3.06224247e-05, 7.97897637e-01],
       [1.32895112e-02, 2.21742102e-05, 1.37409268e-04, ...,
        1.20443901e-05, 4.46451704e-05, 7.92871714e-01],
       [1.48815271e-02, 1.72994933e-05, 1.29967608e-04, ...,
        1.20420473e-05, 4.87234538e-05, 7.74703920e-01],
       ...,
       [1.24373361e-02, 2.15632354e-05, 1.40539865e-04, ...,
        1.32176674e-05, 4.24373975e-05, 8.02903116e-01],
       [1.59358568e-02, 3.84141749e-05, 2.61922221e-04, ...,
        2.77221043e-05, 7.82632051e-05, 7.73363829e-01],
       [1.04056085e-02, 1.37029920e-05, 1.19411576e-04, ...,
        9.56533859e-06, 4.07898842e-05, 8.10565829e-01]], dtype=float32)

In [None]:
most_similar(preds[0], customers_final.drop(columns='CUSTOMER_ID'), 10)

In [2]:
customers_final.drop(columns='CUSTOMER_ID')

NameError: name 'customers_final' is not defined

In [50]:
#def most_similar(pred, customers):
    
similarities = cosine_similarity(
    preds,
    y
)

In [61]:
X

Unnamed: 0,Female,Male,Sweden,UK,US,Bargain hunter,Impulse,Lookers,Loyal,Need-based,New,age
0,False,True,True,False,False,False,False,False,False,True,False,0.27
1,False,True,False,False,True,False,False,False,False,True,False,0.55
2,True,False,False,True,False,False,False,False,True,False,False,0.55
3,True,False,False,False,True,False,False,False,False,True,False,0.32
4,False,True,False,False,True,False,False,False,False,False,True,0.22
...,...,...,...,...,...,...,...,...,...,...,...,...
26699,True,False,False,True,False,False,False,False,False,True,False,0.33
26700,True,False,False,False,True,False,False,False,False,True,False,0.28
26701,False,True,True,False,False,False,False,False,False,True,False,0.58
26702,True,False,True,False,False,False,False,False,False,False,True,0.21


In [1]:
similarities

NameError: name 'similarities' is not defined

In [64]:
positive_weights = {}
negative_weights = {}
for c in class_names:
    positive_weights[c] = train_df.shape[0]/(2*np.count_nonzero(train_df[c]==1))
    negative_weights[c] = train_df.shape[0]/(2*np.count_nonzero(train_df[c]==0))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:
pd.get_dummies(products['CATEGORY']).sum()

In [None]:
X = pd.concat([
    pd.get_dummies(products['CATEGORY'])
], axis=1)
X['price'] = products['PRICE']/products['PRICE'].max()

In [83]:
true_positive.numpy().sum(axis=0)

array([135, 117, 88, 82, 86, 0, 21, 18, 65, 87, 65, 95.7], dtype=float32)

In [None]:
transactions