In [1]:
%%time
import pandas as pd
import os
import numpy as np
import torch
import torch_geometric
from sklearn.preprocessing import LabelEncoder
import gc
from torch_geometric.data import Data

# item_feature = pd.read_csv('../data/item_feature.csv')
# test = pd.read_csv('../data/test.csv')
# user_feature = pd.read_csv('../data/user_feature.csv')
train = pd.read_pickle('../data/sml_train.pkl')

  from ._conv import register_converters as _register_converters


Wall time: 53.9 s


In [2]:
train.head()

Unnamed: 0,user_id,item_id,behavior_type,date
0,1732029000.0,319336400.0,clk,2019-06-19
2,1732029000.0,1197152000.0,clk,2019-06-19
3,1732029000.0,1145630000.0,clk,2019-06-19
5,1732029000.0,1162473000.0,clk,2019-06-19
7,1732029000.0,1128524000.0,clk,2019-06-19


### 加入edge信息的GCN(时间序列思路，未解决推荐问题和冷启动问题)
1. 根据历史 __所有behavior边__ 预测未来可能在这些边中是clk的概率
2. __behavior__ 用 __user-item__ 二部图来表示， __user__ 和 __item__ 作为节点，边属性为 __behavior_type__
3. 更新每个user/item feature时，concat 1-hop node embedding和edge_embedding，然后求均值作为新的user/item feature
4. 最后concat __user_embedding__ , __item_embedding__ 作为user-item pair 的feature, 该user-item pair在未来是否发生clk作为label

In [3]:
%%time
his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id','behavior_type'])
now = train[(train.date>='2019-06-20')&(train.behavior_type=='clk')].drop_duplicates(subset=['user_id','item_id'])
del train
now.rename(columns={'behavior_type':'label'},inplace=True)
train = his[['user_id','item_id','behavior_type']].merge(now[['user_id','item_id','label']],how='left')
del his,now
gc.collect()
train['label'] = train['label'].map({'clk':1})
train['label'] = train['label'].fillna(0) 

Wall time: 1min 29s


### build model

In [5]:
%%time
train = pd.concat([train[train.label==1],train[train.label==0].sample(2000000)],axis=0)
# 显存不足，下采样

u_enc,i_enc,e_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id']),LabelEncoder().fit(train['behavior_type'])
train['user_id'] = u_enc.transform(train['user_id'])
train['item_id'] = i_enc.transform(train['item_id'])
train['behavior_type'] = e_enc.transform(train['behavior_type'])

Wall time: 18.8 s


In [11]:
# clk rate
train.label.sum()/(train.shape[0]-train.label.sum())

0.860387

#### no edge GCN   
不利用边属性信息，只用节点的embedding进行GCN

In [6]:
edge_index = torch.tensor([train['user_id'].values,train['item_id'].values+1+train['user_id'].max()])
edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)
u = torch.tensor(train['user_id'].unique().reshape(-1,1))
i = torch.tensor(train['item_id'].unique().reshape(-1,1))
e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))
y = torch.tensor(train.label.values,dtype=torch.long)
data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)

In [7]:
from torch_geometric.nn import GCNConv,MessagePassing
import torch.nn.functional as F

# hyper param
EMB_DIM = 10

class noedge_GCN(torch.nn.Module):
    def __init__(self):
        super(noedge_GCN, self).__init__()
        self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)
        self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)
        self.conv1 = GCNConv(EMB_DIM,6)
        self.conv2 = GCNConv(6,4)
        self.lin = torch.nn.Linear(8,2)
        
    def forward(self, data):
        u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr
        
        emb_u = self.u_emb(u).view(-1,EMB_DIM)
        emb_i = self.i_emb(i).view(-1,EMB_DIM)
        x = torch.cat([emb_u, emb_i],dim=0)
        x = self.conv1(x,edge_index)
        x = F.relu(x)
        x = F.dropout(x,training=self.training)
        x = self.conv2(x,edge_index)
        x = F.relu(x)
        x = F.dropout(x,training=self.training)
        x = torch.cat([x[edge_index[0]],x[edge_index[1]]],dim=1)
        x = self.lin(x)
        x = F.dropout(x,training=self.training)
        
        return F.log_softmax(x,dim=1)
        

model = noedge_GCN()

In [12]:
%%time
model = noedge_GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
weight = torch.tensor([0.860387,1],dtype=torch.float) # label的weight属于超参，需要调参！！！

model.train()
for epoch in range(10):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out, data.y,weight=weight)
    loss.backward()
    optimizer.step()
    if epoch%1==0:
        model.eval()
        print('epoch_{} loss: {}'.format(epoch,loss.item()))

epoch_0 loss: 0.8651493787765503
epoch_1 loss: 0.7528172135353088
epoch_2 loss: 0.7403311133384705
epoch_3 loss: 0.7339281439781189
epoch_4 loss: 0.7297651171684265
epoch_5 loss: 0.7220962643623352
epoch_6 loss: 0.7202253341674805
epoch_7 loss: 0.7201514840126038
epoch_8 loss: 0.7203397750854492
epoch_9 loss: 0.721700131893158
Wall time: 3min 35s


In [14]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
model.eval()
_,pred = model(data).max(dim=1)
train['pred'] = pred.numpy()
print('accuracy: ',accuracy_score(train['label'],train['pred']))
print('recall: ',recall_score(train['label'], train['pred']))
print('precision: ',precision_score(train['label'], train['pred']))
print('f1: ',f1_score(train['label'], train['pred']))

accuracy:  0.46687463414870134
recall:  0.9499719312355951
precision:  0.4627905982579322
f1:  0.6223807175044113


#### edge GCN
同时利用node_embedding和edge_embedding做GCN，最终每个节点的embedding是融合了该节点领域的node_embedding和edge_embedding  
魔改GCN

In [18]:
edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])
edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)
u = torch.tensor(train['user_id'].unique().reshape(-1,1))
i = torch.tensor(train['item_id'].unique().reshape(-1,1))
e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))
y = torch.tensor(train.label.values,dtype=torch.long)
data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)

In [32]:
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class edgeGCN(MessagePassing):
    
    def __init__(self, in_channels, out_channels, flow, aggr='add', **kwargs):
        super(edgeGCN, self).__init__(aggr=aggr,flow=flow)
        self.lin_u = torch.nn.Linear(in_channels, out_channels)
        self.lin_i = torch.nn.Linear(in_channels, out_channels)
        self.lin_e = torch.nn.Linear(in_channels, out_channels)
        self.lin_aggr = torch.nn.Linear(out_channels*2, out_channels)
        self.flow = flow
        
    def forward(self,u,i,e,edge_index,edge_type):
        
        # linear transformation
        u = self.lin_u(u)
        u = F.relu(u)
        u = F.dropout(u)
        i = self.lin_i(i)
        i = F.relu(i)
        i = F.dropout(i)
        e = self.lin_e(e)
        e = F.relu(e)
        e = F.dropout(e)        
        
        return self.propagate(x=(u,i),e=e,edge_index=edge_index,edge_type=edge_type,size=(u.size(0), i.size(0)))
    
    def message(self,x_j, x_i, e, edge_index, edge_type,size):
        
        # x_i is user_embedding
        # x_j is item_embedding
        
        # get normalized laplacian
        row,col = edge_index
        deg_i = degree(row, size[0], dtype=x_i.dtype)
        deg_j = degree(col, size[1], dtype=x_j.dtype)
        deg_inv_sqrt_i = deg_i.pow(-0.5)
        deg_inv_sqrt_j = deg_j.pow(-0.5)
        norm = deg_inv_sqrt_i[row]*deg_inv_sqrt_j[col]
        
        # concat neighbor nodes embedding and edge embedding
        if self.flow == 'target_to_source':
            emb = torch.cat([x_j, e[edge_type]], dim=1)
        else:
            emb = torch.cat([x_i, e[edge_type]], dim=1)
        return norm.view(-1,1)*emb
        
    def update(self, aggr_out):
        return self.lin_aggr(aggr_out)

In [69]:
# hyper param
EMB_DIM = 10

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)
        self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)
        self.e_emb = torch.nn.Embedding(len(e),EMB_DIM)
        self.e_lin_1 = torch.nn.Linear(EMB_DIM,6)
        self.u_gcn_1 = edgeGCN(EMB_DIM,6,flow='target_to_source')
        self.i_gcn_1 = edgeGCN(EMB_DIM,6,flow='source_to_target')
        self.u_gcn_2 = edgeGCN(6,2,flow='target_to_source')
        self.i_gcn_2 = edgeGCN(6,2,flow='source_to_target')
        self.lin = torch.nn.Linear(4,2)
        
    def forward(self, data):
        u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr
        
        u_emb = self.u_emb(u).view(-1,EMB_DIM)
        i_emb = self.i_emb(i).view(-1,EMB_DIM)
        e_emb = self.e_emb(e).view(-1,EMB_DIM)
        
        x_u,x_i = self.u_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\
                    self.i_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr)
        e_emb = self.e_lin_1(e_emb)
        x_u,x_i = self.u_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\
                    self.i_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr)

        
        x = torch.cat([x_u[edge_index[0]],x_i[edge_index[1]]],dim=1)
        x = self.lin(x)
        x = F.dropout(x,training=self.training)
        
        return F.log_softmax(x,dim=1)

In [73]:
%%time
model = Net()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
weight = torch.tensor([0.860387,1],dtype=torch.float) # label的weight属于超参，需要调参！！！

model.train()

# train param
patience = 0
best_loss = 1

for epoch in range(10):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out, data.y,weight=weight)
    
    if epoch%1==0:
        model.eval()
        print('epoch_{} loss: {}'.format(epoch,loss.item()))
        
    if loss>best_loss:
        patience += 1
        if patience == 2: 
            break
    else:
        patience = 0
        best_loss = loss
        
    loss.backward()
    optimizer.step()

epoch_0 loss: 0.8134878873825073
epoch_1 loss: 0.7332140207290649
epoch_2 loss: 0.7305666208267212
epoch_3 loss: 0.7313019633293152
Wall time: 1min 36s


In [74]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
model.eval()
_,pred = model(data).max(dim=1)
train['pred'] = pred.numpy()
print('accuracy: ',accuracy_score(train['label'],train['pred']))
print('recall: ',recall_score(train['label'], train['pred']))
print('precision: ',precision_score(train['label'], train['pred']))
print('f1: ',f1_score(train['label'], train['pred']))

accuracy:  0.5050347051446823
recall:  0.5282611197054349
precision:  0.468827943036212
f1:  0.4967732239615924
