In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the data and model
# paths configured in the next cell live under this mount.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import os

# Project folder on the mounted Drive.
# NOTE(review): this path is relative, so it only resolves when the working
# directory is /content (the parent of the mount point) — confirm.
DIR = 'drive/MyDrive/sber'

# History / target pair used to train the model in the first training cell.
TRAIN_VAL2_PATH = os.path.join(DIR, 'train_val2.parquet') 
VAL2_PATH = os.path.join(DIR, 'val2.parquet')  

# History / target pair used for scoring, plus the file the recommendations are written to.
TRAIN_VAL1_PATH = os.path.join(DIR, 'train_val1.parquet') 
VAL1_PATH = os.path.join(DIR, 'val1.parquet')  
RECS_NN_VAL1_PATH = os.path.join(DIR, 'recs_nn_val1.parquet') 

# USER_DECODER_PATH is not referenced in the visible cells; NN_MODEL_PATH is
# where the final pickled model is written.
USER_DECODER_PATH = os.path.join(DIR, 'user_decoder.pkl') 
NN_MODEL_PATH = os.path.join(DIR, 'nn_model.pkl')

# Fixed sizes of the id spaces. They are used as sparse-matrix dimensions
# (rows = NUM_USERS) and as the model's input/output widths, so every id must
# be strictly smaller than the corresponding constant.
NUM_CLUSTERS = 8000
NUM_USERS = 1595239
NUM_RETAILERS = 118
NUM_CITIES = 148

In [None]:
from typing import Iterable, List

import pickle
import torch
import numpy as np
import pandas as pd
import scipy.sparse as sp

# Seed every RNG the notebook draws from: torch (default generator plus the
# current and all CUDA devices) for weight initialisation, and NumPy's global
# RNG, which drives the per-epoch user shuffle in Dataset.__iter__.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

In [None]:
def create_sparse_matrix(short_train, col, num_classes, use_ones=False):
  """Build a (NUM_USERS x num_classes) CSR matrix of per-user statistics for `col`.

  One entry is produced per distinct (user_id, col value) pair found in
  `short_train`. Its value is the fraction of that user's rows in
  `short_train` that carry the given value, or 1.0 when `use_ones` is set.

  Args:
    short_train: frame with `user_id`, `col` and a precomputed `user_{col}`
      key column (user_id * 10000 + col value).
    col: name of the categorical id column to encode.
    num_classes: number of columns of the resulting matrix.
    use_ones: store a binary indicator instead of the share.

  Returns:
    scipy.sparse.csr_matrix of shape (NUM_USERS, num_classes).
  """
  key_name = f'user_{col}'
  pairs = short_train[['user_id', col]].drop_duplicates()

  # Same composite key as the one stored in short_train, rebuilt for the unique pairs.
  pair_keys = pairs['user_id'].astype(np.int64) * 10000 + pairs[col]
  rows_per_pair = pair_keys.map(short_train[key_name].value_counts())
  rows_per_user = pairs['user_id'].map(short_train['user_id'].value_counts())
  share = rows_per_pair / rows_per_user

  values = np.ones(len(pairs)) if use_ones else share
  coords = (pairs['user_id'], pairs[col])
  return sp.csr_matrix((values, coords), shape=(NUM_USERS, num_classes))

def create_x_y(train_val, val=None):
  """Build the sparse feature matrix (and optional target matrix) for all users.

  Features are the horizontal concatenation of four per-user blocks built by
  `create_sparse_matrix`: retailer shares, city shares, cluster shares and a
  binary cluster indicator, i.e. NUM_RETAILERS + NUM_CITIES + 2 * NUM_CLUSTERS
  columns in total.

  Args:
    train_val: history frame with `user_id`, `order_id`, `cluster_id`,
      `retailer_id` and `city_id` columns. It is not modified.
    val: optional frame with `user_id` and `cluster_id`; each row marks a
      positive (user, cluster) target.

  Returns:
    (x, y) where both are CSR matrices with NUM_USERS rows; y is None when
    `val` is not given.
  """
  # Keep one row per (order, cluster). The explicit copy gives us a frame we
  # own, so the column assignments below do not write into a slice of the
  # caller's data (which is what triggered pandas' SettingWithCopyWarning).
  is_repeat = train_val[['order_id', 'cluster_id']].duplicated()
  short_train = train_val.loc[~is_repeat].copy()

  # Composite integer keys consumed by create_sparse_matrix. The 10000
  # multiplier requires every id value to be below 10000.
  for col in ('retailer_id', 'city_id', 'cluster_id'):
    short_train[f'user_{col}'] = short_train['user_id'].astype(np.int64) * 10000 + short_train[col]

  x1 = create_sparse_matrix(short_train, 'retailer_id', NUM_RETAILERS)
  x2 = create_sparse_matrix(short_train, 'city_id', NUM_CITIES)
  x3 = create_sparse_matrix(short_train, 'cluster_id', NUM_CLUSTERS)
  x4 = create_sparse_matrix(short_train, 'cluster_id', NUM_CLUSTERS, True)

  x = sp.hstack([x1, x2, x3, x4], format='csr')
  if val is None:
    return x, None

  # NOTE(review): csr_matrix sums duplicate coordinates, so repeated
  # (user_id, cluster_id) rows in `val` would produce targets greater than 1.
  # Confirm that `val` is de-duplicated upstream.
  y = sp.csr_matrix((np.ones(len(val)), [val['user_id'], val['cluster_id']]), shape=(NUM_USERS, NUM_CLUSTERS))
  return x, y


class Dataset:
    """Mini-batch view over the rows of sparse feature / target matrices.

    Args:
        x: sparse matrix of features, indexed by user id along the rows.
        y: sparse matrix of targets with the same row indexing, or None when
            no targets are available (inference).
        users: array of user ids (row indices) to iterate over.
        batch_size: maximum number of users per batch.
        device: torch device the dense batches are moved to.

    Iterating shuffles `users` IN PLACE and then yields `len(self)` batches in
    that shuffled order, so reading `self.users` after a full pass gives the
    ids in the order in which they were yielded.
    """

    def __init__(self, x, y, users, batch_size, device='cuda'):
        self.batch_size = batch_size
        self.device = device
        self.x = x
        self.y = y
        self.users = users
        self.num_users = len(users)
        # Ceiling of num_users / batch_size (0 batches for an empty user list).
        self.num_batches = int((self.num_users - 1) / batch_size + 1)

    def _rows_to_tensor(self, matrix, rows):
        """Densify the selected rows of `matrix` into a float32 tensor on `self.device`."""
        dense = matrix[rows].toarray()
        return torch.as_tensor(dense, dtype=torch.float32).to(self.device)

    def __getitem__(self, batch_num):
        """Return `(features, targets)` for batch `batch_num`; targets is None without `y`."""
        start = batch_num * self.batch_size
        size = min(self.num_users - start, self.batch_size)
        users = self.users[start: start + size]
        features = self._rows_to_tensor(self.x, users)
        # Look at this instance's targets, not at a module-level variable that
        # happens to be called `y` in the notebook namespace.
        if self.y is not None:
            return features, self._rows_to_tensor(self.y, users)
        return features, None

    def __iter__(self):
        # In-place shuffle: the new order stays visible through `self.users`.
        np.random.shuffle(self.users)
        for batch_num in range(self.num_batches):
            yield self[batch_num]

    def __len__(self):
        return self.num_batches


class Model(torch.nn.Module):
    """Two-layer perceptron that scores every cluster for a user.

    The input width is NUM_RETAILERS + NUM_CITIES + 2 * NUM_CLUSTERS, matching
    the four feature blocks stacked by `create_x_y`. The output holds
    NUM_CLUSTERS sigmoid activations, one per cluster.

    Args:
        device: torch device on which both linear layers are placed.
    """

    def __init__(self, device='cpu'):
        super().__init__()
        self.device = device

        input_dim = NUM_RETAILERS + NUM_CITIES + 2 * NUM_CLUSTERS
        hidden_dim = 10000
        self.linear = torch.nn.Linear(input_dim, hidden_dim).to(self.device)
        self.linear2 = torch.nn.Linear(hidden_dim, NUM_CLUSTERS).to(self.device)

        # Replace the default weight init with Xavier-uniform (biases keep their default).
        for layer in (self.linear, self.linear2):
            torch.nn.init.xavier_uniform_(layer.weight)

        self.sigmoid = torch.nn.Sigmoid()
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Map a batch of feature rows to per-cluster scores in (0, 1)."""
        hidden = self.relu(self.linear(x))
        logits = self.linear2(hidden)
        return self.sigmoid(logits)


def fit_model(model, dataset, lr=None, epochs=None):
  """Train `model` in place with Adagrad on a binary cross-entropy objective.

  Args:
    model: torch module whose output lies in [0, 1] (BCELoss requirement).
    dataset: iterable yielding `(features, targets)` tensor pairs; it is
      iterated once per epoch.
    lr: learning rate. Defaults to the notebook-level `config['lr']`.
    epochs: number of passes over `dataset`. Defaults to `config['epoch']`.

  The explicit arguments make the function usable without the global
  `config` dict; callers that pass only `model` and `dataset` get the
  previous behaviour.
  """
  if lr is None:
    lr = config['lr']
  if epochs is None:
    epochs = config['epoch']

  optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
  loss_function = torch.nn.BCELoss()
  for _ in range(epochs):
    for x, y in dataset:
      optimizer.zero_grad()
      score = model(x)
      loss = loss_function(score, y)
      loss.backward()
      optimizer.step()


def get_rec(model, dataset, topk=160):
  """Score every batch of `dataset` and collect the `topk` best clusters per user.

  Args:
    model: callable mapping a feature batch to a (batch, n_clusters) score tensor.
    dataset: iterable of `(features, targets)` batches exposing a `users`
      array; `users` is read AFTER iteration, so it must list the user ids in
      the order in which their rows were yielded.
    topk: number of clusters kept per user.

  Returns:
    DataFrame with columns `user_id`, `cluster_id` (int16) and `scores`,
    holding `topk` consecutive rows per user.
  """
  top_items = []
  top_scores = []
  with torch.no_grad():
    for features, _ in dataset:
      best_scores, best_items = torch.topk(model(features), topk)
      top_items.append(best_items.flatten().cpu().detach().numpy().astype(np.int16))
      top_scores.append(best_scores.flatten().cpu().detach().numpy())

  # Repeat each user id topk times so it lines up with the flattened top-k arrays.
  repeated_users = dataset.users.reshape(-1, 1).repeat(topk, 1).flatten()

  return pd.DataFrame({
      'user_id': repeated_users,
      'cluster_id': np.hstack(top_items),
      'scores': np.hstack(top_scores),
  })

In [None]:
# Training hyper-parameters shared by the cells below.
config = dict(
    batch_size=3000,  # users per mini-batch
    device='cuda',    # torch device for the model and the batches
    lr=0.01,          # Adagrad learning rate
    epoch=10,         # number of passes over the training users
)

In [None]:
# Train on the val2 split: features come from the train_val2 history, targets
# from val2, and only users that appear in val2 are iterated over.
# `x`, `y`, `dataset` and `model` are notebook-level names reused by later cells.
train_val2 =  pd.read_parquet(TRAIN_VAL2_PATH)
val2 = pd.read_parquet(VAL2_PATH)
x, y = create_x_y(train_val2, val2)
dataset = Dataset(x, y, val2['user_id'].unique(), config['batch_size'], config['device'])
model = Model(config['device'])
fit_model(model, dataset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [None]:
# Score the val1 users with the `model` left by the previous cell (trained on
# the val2 split) and save their top-k clusters. This rebinds `x`, `y` and
# `dataset` to the val1 data.
train_val1 =  pd.read_parquet(TRAIN_VAL1_PATH)
val1 = pd.read_parquet(VAL1_PATH)
x, y = create_x_y(train_val1, val1)
dataset = Dataset(x, y, val1['user_id'].unique(), config['batch_size'], config['device'])
recs = get_rec(model, dataset)
recs.to_parquet(RECS_NN_VAL1_PATH)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [None]:
# Train a fresh model on whatever `dataset` currently holds (the val1 split
# when the notebook is run top to bottom) and persist it for CPU use.
model = Model(config['device'])
fit_model(model, dataset)

# Move the weights to the CPU and keep the `device` attribute in sync, so the
# pickle can be loaded on a machine without a GPU.
model.cpu()
model.device = 'cpu'

# The whole module object is pickled, so loading it requires the `Model` class
# to be importable/defined under the same name. The context manager makes sure
# the file is flushed and closed.
with open(NN_MODEL_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)