In [1]:
%%time
import pandas as pd
import os
import numpy as np
import torch
import torch_geometric
from sklearn.preprocessing import LabelEncoder
import gc
from torch_geometric.data import Data

# item_feature = pd.read_csv('../data/item_feature.csv')
# test = pd.read_csv('../data/test.csv')
# user_feature = pd.read_csv('../data/user_feature.csv')
train = pd.read_pickle('../data/sml_train.pkl')

Wall time: 6.3 s


In [2]:
train.head()

Unnamed: 0,user_id,item_id,behavior_type,date
0,1732029000.0,319336400.0,clk,2019-06-19
2,1732029000.0,1197152000.0,clk,2019-06-19
3,1732029000.0,1145630000.0,clk,2019-06-19
5,1732029000.0,1162473000.0,clk,2019-06-19
7,1732029000.0,1128524000.0,clk,2019-06-19


### baseline思路(时间序列思路，未解决推荐问题和冷启动问题)
1. 根据历史clk记录预测未来可能重复clk的user-item pair
2. 历史clk记录可以用user-item二部图来表示，user和item作为节点，其中的边作为clk记录
3. feature为user_embedding和item_embedding，通过concat两者的embedding后接MLP得到user-item pair的预测值
4. label为历史clk边中重复clk的边  

---
由于内存不足，事先在服务器筛了训练数据中18-20号的数据，用18-19号的clk预测20号会重复clk的。   

In [3]:
%%time
train = train[train.behavior_type=='clk']
his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id'])
now = train[train.date>='2019-06-20']
del train
train = his[['user_id','item_id']].merge(now[['user_id','item_id','behavior_type']],how='left')
del his,now
gc.collect()

Wall time: 54.3 s


42

## build model

In [4]:
%%time
train = pd.concat([train[train.behavior_type.isnull()==False],train[train.behavior_type.isnull()==True].sample(3000000)],axis=0)
# 显存不足，下采样
train['behavior_type'] = train['behavior_type'].fillna(0)
train['behavior_type'] = train['behavior_type'].map({'clk':1})

u_enc,i_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id'])
train['user_id'] = u_enc.transform(train['user_id'])
train['item_id'] = i_enc.transform(train['item_id'])+u_enc.classes_.shape[0]

edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])
u = torch.tensor(train['user_id'].unique().reshape(-1,1))
i = torch.tensor(train['item_id'].unique().reshape(-1,1))
y = torch.tensor(train['behavior_type'].fillna(0).values,dtype=torch.long)
data = Data(u=u,i=i,edge_index=edge_index,y=y)

Wall time: 9.81 s


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

In [59]:
print('positive label ratio: ',data.y.sum().item()/data.y.shape[0])

positive label ratio:  0.2816764522287305


In [60]:
data.contains_isolated_nodes()



False

In [69]:
data.num_nodes



2243363

In [68]:
data.num_edges

4176391

In [66]:
data.u.shape # number of user

torch.Size([776543, 1])

In [67]:
data.i.shape # number of item

torch.Size([1466820, 1])

In [70]:
data.num_edges/(data.u.shape[0]*data.i.shape[0]) # very sparse

3.6665601930725125e-06

In [6]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F


# hyper param
EMB_DIM = 10

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)
        self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)
        self.conv1 = GCNConv(EMB_DIM*2,EMB_DIM)
        self.conv2 = GCNConv(EMB_DIM,8)
        self.lin = torch.nn.Linear(8,2)
        
    def forward(self, data):
        u,i,edge_index = data.u,data.i,data.edge_index
        
        emb_u = self.u_emb(u[edge_index[0]]).view(-1,EMB_DIM)
        emb_i = self.i_emb(i[(edge_index[1]-u_enc.classes_.shape[0])]-u_enc.classes_.shape[0]).view(-1,EMB_DIM)
        
        x = torch.cat([emb_u,emb_i],dim=1)
        x = self.conv1(x,edge_index)
        x = F.relu(x)
        x = F.dropout(x,training=self.training)
        x = self.conv2(x,edge_index)
        x = F.relu(x)
        x = F.dropout(x,training=self.training)
        
        x = self.lin(x)
        return F.log_softmax(x,dim=1)
    


In [58]:
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
weight = torch.tensor([1,1.075],dtype=torch.float).to(device)

model.train()
for epoch in range(50):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[],data.y,weight=weight)
    loss.backward()
    optimizer.step
    if epoch%5==0:
        print(loss)
        _,pred = model(data).max(dim=1) 
        print(pred.eq(data.y).sum().item()/data.y.shape[0])

tensor(0.7049, device='cuda:0', grad_fn=<NllLossBackward>)
0.6750931605781164
tensor(0.7053, device='cuda:0', grad_fn=<NllLossBackward>)
0.674996905222715
tensor(0.7048, device='cuda:0', grad_fn=<NllLossBackward>)
0.6750962733134901
tensor(0.7049, device='cuda:0', grad_fn=<NllLossBackward>)
0.6753460104669319
tensor(0.7050, device='cuda:0', grad_fn=<NllLossBackward>)
0.6751274006672268
tensor(0.7052, device='cuda:0', grad_fn=<NllLossBackward>)
0.6751913314629785
tensor(0.7050, device='cuda:0', grad_fn=<NllLossBackward>)
0.6752758542004329
tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
0.675027314252904
tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
0.6748395923657531
tensor(0.7052, device='cuda:0', grad_fn=<NllLossBackward>)
0.6751312317261482


### evalutate

In [55]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
train['pred'] = pred.to('cpu').numpy()
print('accuracy: ',accuracy_score(train['behavior_type'].fillna(0),train['pred']))
print('recall: ',recall_score(train['behavior_type'].fillna(0), train['pred']))
print('precision: ',precision_score(train['behavior_type'].fillna(0), train['pred']))
print('f1: ',f1_score(train['behavior_type'].fillna(0), train['pred']))

accuracy:  0.6830433261636661
recall:  0.149021881330272
precision:  0.3520523736846333
f1:  0.2094041213580665


In [56]:
train.pred.value_counts()

0    3678431
1     497960
Name: pred, dtype: int64

In [57]:
train.behavior_type.fillna(0).value_counts()

0.0    3000000
1.0    1176391
Name: behavior_type, dtype: int64