In [2]:
import os
import torch as th
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from collections import *
import numba

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# @numba.jit(parallel=True)
def merge_all_scores(predictions: list, ground_truth=None):
    N = predictions[0].shape[0]
    all_scores = []
    all_labels = []
    all_cands = []
    for i in range(N):
        scores = []
        labels = []
        truth = ground_truth.iloc[i] if ground_truth is not None else None
        id2score = [Counter(dict(zip(p['next_item_prediction'][i], p['scores'][i]))) for p in predictions]
        all_candidates = np.concatenate([p['next_item_prediction'][i] for p in predictions])
        for cand in all_candidates:
            scores.append([d[cand] for d in id2score])
            if ground_truth is not None:
                if cand == truth:
                    labels.append(1)
                else:
                    labels.append(0)
        all_scores.append(scores)
        all_labels.append(labels)
        all_cands.append(all_candidates)
    return all_scores, all_labels, all_cands

In [3]:
def normalization(data):
    print("Max=", np.max(data, axis=0), " Min=", np.min(data, axis=0))
    _range = np.max(data, axis=0) - np.min(data, axis=0)
    return (data - np.min(data, axis=0)) / _range

def standardization(data):
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    print("Mean={}, Sigma={}".format(mu, sigma))
    return (data - mu) / sigma

def softmax(data, axis):
    _exp = np.exp(data)
    return data / _exp.sum(axis=axis, keepdims=True)

In [4]:
valid_data = pd.read_csv("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/all_task_1_valid_sessions.csv", sep=",")
roberta_pred = pd.read_parquet("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/roberta/roberta_valid_150_with_score.parquet", engine='pyarrow')
sasrec_pred = pd.read_parquet("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/SASRec_Next/SASRec_valid_150_with_score.parquet", engine='pyarrow')
graph_pred = pd.read_parquet("/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/co-occurrence_graph/graph_valid_150_with_score.parquet", engine='pyarrow')

In [5]:
roberta_pred

Unnamed: 0,next_item_prediction,scores
0,"[B084ZPL21L, B0B2JT2BND, B0B2JSKN7X, B08F7P8R6...","[266.0794982910156, 265.99432373046875, 265.99..."
1,"[B09WM9W6WQ, B09LCPT9DQ, B09MRYK5CV, B09LCRNQT...","[268.8392639160156, 268.33990478515625, 268.14..."
2,"[B06ZYHMRST, B007CW985C, B0017RTPK6, B002E2N5M...","[264.55255126953125, 264.0582580566406, 264.01..."
3,"[B08DP195R4, B01M5EBAO4, B09BNHWWZM, B0BJ54PZV...","[264.06292724609375, 263.95538330078125, 263.7..."
4,"[B0B6NY5RM8, B0B6PF619D, B09BJGBBBR, B0B6NYJRW...","[267.2440185546875, 267.2406921386719, 267.028..."
...,...,...
361576,"[B08HH6L4PB, B08L8N8HDR, B079FV57RR, B085WCD1C...","[266.1954040527344, 266.1148986816406, 264.926..."
361577,"[B09D76FT9D, B09D71VQF2, B09XH1YGLL, B09XGMMW6...","[267.57916259765625, 267.4937744140625, 267.48..."
361578,"[B0B6D8KGPC, B0BC2K5CY1, B0BC38GHB4, B09SXRRTZ...","[264.9151306152344, 264.9008483886719, 264.841..."
361579,"[B08T5Q4JGS, B08M9CDNSH, B0BFDZTJMF, B08RQDVX7...","[265.9405517578125, 265.3836669921875, 265.371..."


In [6]:
common_rate = {'bert-sasrec':[], 'sasrec-graph':[], 'bert-graph':[], 'all':[]}

for i in trange(valid_data.shape[0]):
    roberta_rec = set(roberta_pred.iloc[i]['next_item_prediction'])
    sasrec_rec = set(sasrec_pred.iloc[i]['next_item_prediction'])
    graph_rec = set(graph_pred.iloc[i]['next_item_prediction'])
    common = roberta_rec.intersection(sasrec_rec)
    common_rate['bert-sasrec'].append(len(common)/len(sasrec_rec))
    common = graph_rec.intersection(sasrec_rec)
    common_rate['sasrec-graph'].append(len(common)/len(sasrec_rec))
    common = roberta_rec.intersection(graph_rec)
    common_rate['bert-graph'].append(len(common)/len(sasrec_rec))
    common = roberta_rec.intersection(sasrec_rec.intersection(graph_rec))
    common_rate['all'].append(len(common)/len(sasrec_rec))

100%|██████████| 361581/361581 [01:02<00:00, 5788.00it/s]


In [30]:
rate = {}
for k, v in common_rate.items():
    rate[k] = np.mean(v)
print(rate)

{'bert-sasrec': 0.26116807391243824, 'sasrec-graph': 0.11254824414630928, 'bert-graph': 0.0831898062490378, 'all': 0.06386674078560546}


In [7]:
predictions = [sasrec_pred, roberta_pred, graph_pred]
all_scores, all_labels, _ = merge_all_scores(predictions, valid_data['next_item'])
all_scores = np.array(all_scores)
all_labels = np.array(all_labels)

In [10]:
all_scores[:,:, -1] = np.log(all_scores[:,:, -1]+1)

In [52]:
all_probs = softmax(all_scores, axis=1)

  _exp = np.exp(data)


In [None]:
all_candidates_valid = pd.DataFrame({'locale': valid_data['locale'], 'candidates': _})

In [None]:
all_candidates_valid.to_parquet("/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/ensemble/valid_session_candidates_450.parquet", engine='pyarrow')

In [3]:
np.save('scores_450.npy', all_scores)
np.save('labels_450.npy', all_labels)

NameError: name 'all_scores' is not defined

In [5]:
all_scores = np.load('/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/ensemble/scores_450.npy')

In [11]:
type(all_scores[0][0][1])

numpy.float64

In [61]:
scores_mean = all_scores.mean(axis=0).mean(axis=0)
print(scores_mean)

[  5.74415528 122.87012845   0.31664055]


In [62]:
flat_score = all_scores.reshape(-1, 3)
flat_score = normalization(flat_score)
norm_scores = flat_score.reshape(all_scores.shape)

Max= [ 41.49373245 272.68444824   7.85360481]  Min= [-3.82411766  0.          0.        ]


In [22]:
print(norm_scores[:,:,0], "\n", norm_scores[:,:,1], "\n", norm_scores[:,:,2])

[[0.63509469 0.62448067 0.62422734 ... 0.08438436 0.42763748 0.42298822]
 [0.57159204 0.4863681  0.47803585 ... 0.34399732 0.08438436 0.08438436]
 [0.55927866 0.52617882 0.52049232 ... 0.08438436 0.08438436 0.08438436]
 ...
 [0.56258023 0.54705211 0.51480086 ... 0.08438436 0.08438436 0.08438436]
 [0.59835961 0.58720774 0.5590236  ... 0.08438436 0.08438436 0.08438436]
 [0.54689293 0.54322494 0.54262573 ... 0.08438436 0.08438436 0.08438436]] 
 [[0.97363901 0.97363901 0.         ... 0.         0.         0.        ]
 [0.98406751 0.98336591 0.9753296  ... 0.97524891 0.         0.        ]
 [0.         0.9664743  0.         ... 0.         0.         0.        ]
 ...
 [0.97123708 0.         0.96223483 ... 0.         0.         0.        ]
 [0.97220907 0.97299494 0.97220123 ... 0.         0.         0.        ]
 [0.97147143 0.         0.         ... 0.         0.         0.        ]] 
 [[0.48184111 0.46317408 0.39924268 ... 0.08825847 0.08825847 0.08825847]
 [0.79232103 0.50311211 0.38765924 

In [63]:
class ScoreDataset(th.utils.data.Dataset):
    def __init__(self, scores, labels):
        self.scores = scores
        self.labels = labels
        print(scores.shape, labels.shape)

    def __getitem__(self, index):
        return {'scores': self.scores[index], 'labels': self.labels[index]}

    def __len__(self):
        return self.scores.shape[0]

In [64]:
class EmsembleWeight(th.nn.Module):
    def __init__(self, n_models):
        super().__init__()
        self.weights = th.nn.Parameter(th.ones(n_models, dtype=th.double), requires_grad=True)

    def forward(self, scores):
        w = th.softmax(self.weights, dim=-1)
        weighted_score = scores @ w.view(-1,1)
        return weighted_score.squeeze()

    def cal_loss(self, batch):
        score = self.forward(batch['scores'])
        pos_mask = batch['labels'].nonzero()
        pos_score = score[pos_mask[:, 0], pos_mask[:, 1]]
        score_sum = th.logsumexp(score, dim=-1)[pos_mask[:, 0]]
        loss = (- pos_score + score_sum).mean()
        return loss

In [65]:
dataset = ScoreDataset(norm_scores[:,:,[0,2]], all_labels)
loader = th.utils.data.DataLoader(dataset, batch_size=1024, shuffle=True)

(361581, 450, 2) (361581, 450)


In [66]:
model = EmsembleWeight(2)
model = model.cuda()
optimizer = th.optim.Adam(model.parameters(), lr=0.001)

In [67]:
epochs = 50

In [68]:
model.train()
for e in range(epochs):
    e_loss = 0
    step = 0
    for batch in loader:
        optimizer.zero_grad()
        batch = {k: v.cuda() for k,v in batch.items()}
        loss = model.cal_loss(batch)
        loss.backward()
        optimizer.step()

        e_loss += loss.data
        step += 1
    print("Epoch: {}: loss: {:.5f}".format(e, e_loss / step))
    print("Current weight: {}".format(model.weights.data))

Epoch: 0: loss: 5.91446
Current weight: tensor([0.6708, 1.3292], device='cuda:0', dtype=torch.float64)
Epoch: 1: loss: 5.91163
Current weight: tensor([0.3921, 1.6079], device='cuda:0', dtype=torch.float64)
Epoch: 2: loss: 5.90994
Current weight: tensor([0.1621, 1.8379], device='cuda:0', dtype=torch.float64)
Epoch: 3: loss: 5.90893
Current weight: tensor([-0.0311,  2.0311], device='cuda:0', dtype=torch.float64)
Epoch: 4: loss: 5.90833
Current weight: tensor([-0.1986,  2.1986], device='cuda:0', dtype=torch.float64)
Epoch: 5: loss: 5.90788
Current weight: tensor([-0.3462,  2.3462], device='cuda:0', dtype=torch.float64)
Epoch: 6: loss: 5.90770
Current weight: tensor([-0.4809,  2.4809], device='cuda:0', dtype=torch.float64)
Epoch: 7: loss: 5.90749
Current weight: tensor([-0.6035,  2.6035], device='cuda:0', dtype=torch.float64)
Epoch: 8: loss: 5.90738
Current weight: tensor([-0.7196,  2.7196], device='cuda:0', dtype=torch.float64)
Epoch: 9: loss: 5.90727
Current weight: tensor([-0.8293,  2.8

In [69]:
th.softmax(model.weights.data, dim=-1)

tensor([2.5876e-05, 9.9997e-01], device='cuda:0', dtype=torch.float64)

In [48]:
test_data = pd.read_csv("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv", sep=",")
roberta_pred_tst = pd.read_parquet("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/roberta/roberta_test_150_with_score.parquet", engine='pyarrow')
sasrec_pred_tst = pd.read_parquet("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/SASRec_Next/SASRec_test_150_with_score.parquet", engine='pyarrow')
graph_pred_tst = pd.read_parquet("/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/co-occurrence_graph/graph_test_150_with_score.parquet", engine='pyarrow')

In [49]:
predictions_tst = [sasrec_pred_tst, roberta_pred_tst, graph_pred_tst]
all_scores_tst, _, all_cands_tst = merge_all_scores(predictions_tst)
all_scores_tst = np.array(all_scores_tst)
cands = np.array(all_cands_tst)

In [56]:
all_scores_tst[:, :, -1] = np.log(all_scores_tst[:,:, -1]+1)

In [57]:
flat_score_tst = all_scores_tst.reshape(-1, 3)
flat_score_tst = normalization(flat_score_tst)
norm_scores_tst = flat_score.reshape(all_scores_tst.shape)

Max= [ 40.96121979 273.03417969   2.16220585]  Min= [-3.29537606  0.          0.        ]


In [51]:
cands.shape == all_scores_tst.shape[:-1]

True

In [52]:
weight = th.softmax(model.weights.data, dim=-1).cpu().numpy()

In [55]:
scores = norm_scores[:, :, [0,2]] @ weight

In [None]:
scores.shape

(316971, 450)

In [None]:
idx = np.argsort(-scores, axis=1)[:,:100]

In [None]:
final_res = []

for i, c in enumerate(all_cands_tst):
    final_res.append(c[idx[i]].tolist())

In [None]:
df = pd.DataFrame({'locale': test_data['locale'], 'next_item_prediction': final_res})
df['next_item_prediction'].apply(len).describe()

count    316971.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: next_item_prediction, dtype: float64

In [None]:
df.to_parquet("/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/ensemble/predictions.parquet", engine='pyarrow')

In [25]:
df

Unnamed: 0,locale,next_item_prediction
0,DE,"[B0BFBQD7BB, B0BFBQD7BB, B0B7S7LBMB, B0B7S7LBM..."
1,DE,"[B01IT50MC8, B09WD6NB21, B00CLN9TYM, B07KWB5C6..."
2,DE,"[B0BF99DXCY, B0044MK532, B0044MK532, B0044MK53..."
3,DE,"[B08M2FXZWR, B0BJYJQS7T, B0BJYJQS7T, B0BJYJQS7..."
4,DE,"[B09L8GQGTY, B09P33NXS5, B09P33NXS5, B096DGXNZ..."
...,...,...
316966,UK,"[B0B5QT4SFX, B0084M66PY, B09BZCDZTK, B083476WT..."
316967,UK,"[B08KGG48WV, B08KDK6FS1, B08KDK6FS1, B08MBP7G5..."
316968,UK,"[B0002EXYZ2, B0002EXYZ2, B09MTQSLVC, B09MTQSLV..."
316969,UK,"[B09C8RQ8NT, B004LR92MK, B09G6M255X, B09G6KJ62..."


In [82]:
valid_data = pd.read_csv("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/all_task_1_valid_sessions.csv", sep=",")
roberta_pred_val = pd.read_parquet("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/roberta/roberta_valid_150_with_score.parquet", engine='pyarrow')
sasrec_pred_val = pd.read_parquet("/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/SASRec_Next/SASRec_valid_150_with_score.parquet", engine='pyarrow')
graph_pred_val = pd.read_parquet("/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/co-occurrence_graph/graph_valid_150_with_score.parquet", engine='pyarrow')

In [89]:
valid_data.shape, roberta_pred_val.shape

((361581, 3), (361581, 2))

In [83]:
predictions_val = [sasrec_pred_val, roberta_pred_val, graph_pred_val]
all_scores_val, _, all_cands_val = merge_all_scores(predictions_val)
all_scores_val = np.array(all_scores_val)
cands = np.array(all_cands_val)

In [84]:
flat_score = all_scores_val.reshape(-1, 3)
flat_score = normalization(flat_score)
norm_scores = flat_score.reshape(all_scores_val.shape)
scores = norm_scores @ weight

Max= [  41.49373245  272.68444824 8210.        ]  Min= [-3.82411766  0.          0.        ]


In [94]:
idx = np.argsort(-scores, axis=1)[:,:100]

final_res = []

for i, c in enumerate(all_cands_val):
    final_res.append(c[idx[i]].tolist())

In [96]:
df_val = pd.DataFrame({'locale': valid_data['locale'], 'next_item_prediction': final_res})
df_val['next_item_prediction'].apply(len).describe()

count    361581.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: next_item_prediction, dtype: float64

In [97]:
df_val.to_parquet("/root/autodl-tmp/huangxu/Amazon-KDDCUP-23/ensemble/predictions_val.parquet", engine='pyarrow')

In [93]:
len(final_res)

316971