<a href="https://colab.research.google.com/github/Rmm6699/HW/blob/master/W8_TA_RecSysGNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 永豐銀行 AI人才培育課程--個人化商品推薦系統 #3

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Locations of the raw click log and of the per-user purchase dictionary.
clicks_path = './data/raw/yoochoose-clicks.dat'
buys_path = './data/raw/yoochoose-buys.npy'

# Click log as a DataFrame.
df = pd.read_csv(clicks_path)
# The .npy file wraps a pickled dict, hence allow_pickle=True plus .item().
# NOTE(security): unpickling can execute arbitrary code - only load trusted files.
buy_item_dict = np.load(buys_path, allow_pickle=True).item()

In [None]:
# Display-only preview: the result is not assigned, so `df` itself is unchanged.
# Drops the row labelled 4 and renumbers the index of the displayed copy.
df.drop(index=4).reset_index(drop=True)

Unnamed: 0,user_id,timestamp,item_id,category
0,10219,2014-04-07T18:02:10.363Z,1768,0
1,10219,2014-04-07T18:07:21.344Z,1118,0
2,10219,2014-04-07T18:12:10.800Z,711,0
3,11147,2014-04-03T15:08:48.448Z,1761,0
4,11147,2014-04-03T15:10:32.687Z,241,0
...,...,...,...,...
5376,11289324,2014-09-28T12:01:51.100Z,2615,20
5377,11289324,2014-09-28T12:02:22.153Z,2235,20
5378,11303431,2014-09-24T10:32:54.219Z,2296,5
5379,11303431,2014-09-24T10:34:02.287Z,2296,5


In [None]:
# Preview the first ten {user_id: [bought item_ids]} entries instead of
# dumping the whole dictionary into the cell output.
dict(list(buy_item_dict.items())[:10])

{124227: [864, 864],
 234892: [1204, 1204],
 304679: [1164],
 358676: [1512, 1489, 1489, 1512],
 468904: [865, 354],
 484992: [334],
 504357: [862, 1338],
 509067: [1978, 1989, 1983],
 612124: [864],
 826473: [393],
 1047341: [988],
 1061114: [1895, 1372],
 1132331: [540],
 1549606: [1568, 1045],
 1610752: [1669],
 1804214: [828],
 2166732: [1699, 1742, 1494, 694, 1659, 1820],
 2455183: [1627, 1668, 1372, 1374],
 2619711: [68],
 2676913: [160],
 2691321: [1741, 553, 1673, 1703],
 2775817: [1215, 1215, 1206],
 2806516: [443, 1647],
 2811569: [475, 1702, 474],
 2992271: [2645, 2645],
 3096232: [965, 960],
 3413956: [1413, 1409, 1415],
 3459484: [1252],
 4487171: [1905, 2050],
 4506726: [180, 181, 181, 180],
 4533094: [1760, 870, 1498],
 4534956: [2160, 1792, 1967],
 4888738: [1810, 1811, 1811, 1802, 1655],
 5066864: [1448, 1483, 1720],
 5177043: [863],
 5286913: [2071, 2016],
 5307603: [1682],
 5439463: [1606],
 5452941: [380, 402, 381, 1285, 1797, 1880, 1885, 556, 2026],
 5462764: [2085

## 1. Create a Custom Dataset
<img src="./data/image/Graph_data_.png" width="80%">

In [None]:
class YooChooseDataset(Dataset):
    """Session-graph dataset built from the YooChoose click and buy files.

    Each ``user_id`` group of the click log becomes one ``Data`` graph:

    * nodes - the distinct items clicked in the group, numbered by a per-group
      ``LabelEncoder`` (i.e. in ascending ``item_id`` order);
    * ``x`` - one ``[item index, category index]`` row per node;
    * ``edge_index`` - one directed edge for every pair of consecutive clicks,
      taken in the row order of the raw file;
    * ``y`` - one 0/1 label per node, 1 when the item is listed for that
      ``user_id`` in the buy dictionary.

    The processed file stores ``(graph_list, num_items, num_categories)``.
    """

    def __init__(self, root):
        super().__init__(root)
        # NOTE(review): the processed file contains pickled ``Data`` objects.
        # Recent PyTorch releases default ``torch.load`` to weights_only=True;
        # confirm this call still succeeds on the installed version.
        self.data, self.num_items, self.num_categories = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """File names looked up to build ``self.raw_paths``."""
        return ['yoochoose-clicks.dat', 'yoochoose-buys.npy']

    @property
    def processed_file_names(self):
        """File name of the cached, processed dataset."""
        return ['yoochoose-clicks-processed.dataset']

    def download(self):
        """Nothing to download: the raw files are expected to be in place."""
        pass

    def process(self):
        """Convert the raw click/buy files into a list of graphs and save it."""
        df = pd.read_csv(self.raw_paths[0])
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
        # use this with trusted files.
        buy_item_dict = np.load(self.raw_paths[1], allow_pickle=True).item()

        # The node features are later used as embedding indices in tables sized
        # by num_items / num_categories, so map both columns onto dense
        # 0..n-1 codes. For ids that are already 0..n-1 this is the identity.
        item_encoder = LabelEncoder()
        category_encoder = LabelEncoder()
        df = df.assign(
            item_code=item_encoder.fit_transform(df.item_id),
            category_code=category_encoder.fit_transform(df.category),
        )
        num_items = len(item_encoder.classes_)
        num_categories = len(category_encoder.classes_)

        # process by user_id
        graph_list = []
        for user_id, group in tqdm(df.groupby('user_id')):
            # 1. Create graph edges
            # Re-encode the raw item_id into per-graph node indices. assign()
            # returns a new frame, so the groupby slice is never written to.
            le = LabelEncoder()
            group = group.assign(local_item_id=le.fit_transform(group.item_id))

            # If click order = [1,2,3,4], source_nodes = [1,2,3], target_nodes = [2,3,4]
            click_sequence = group.local_item_id.values
            source_nodes = click_sequence[:-1]
            target_nodes = click_sequence[1:]
            edge_index = torch.LongTensor(np.array([source_nodes, target_nodes]))

            # 2. Create node_features
            # Exactly one row per node, ordered by node index. De-duplicating on
            # the node index (not on the item/category pair) keeps x aligned
            # with y and edge_index even if an item shows up with two categories.
            nodes = group.drop_duplicates('local_item_id').sort_values('local_item_id')
            node_features = torch.LongTensor(nodes[['item_code', 'category_code']].values)

            # 3. Create labels
            label = np.zeros(len(le.classes_), dtype=np.float32)
            if user_id in buy_item_dict:
                # buy_item_dict {468904(user_id): [865, 354](item_ids)}
                bought = np.asarray(buy_item_dict[user_id])
                # Purchases of items that were never clicked in this group have
                # no node to label; le.transform would raise on them.
                bought = bought[np.isin(bought, le.classes_)]
                if bought.size:
                    label[le.transform(bought)] = 1  # e.g. [0, 0, 1, 0]
            y = torch.FloatTensor(label)

            # 4. Combine node_features, edge_index and labels into Data(x, edge_index, y)
            graph_list.append(Data(x=node_features, edge_index=edge_index, y=y))

        # 5. Save processed data
        torch.save((graph_list, num_items, num_categories), self.processed_paths[0])

    def len(self):
        """Number of graphs held in memory."""
        return len(self.data)

    def get(self, idx):
        """Return the graph stored at position ``idx``."""
        return self.data[idx]

In [None]:
# Build the dataset rooted at ./data; __init__ then loads the processed file.
# NOTE(review): this relies on the PyG Dataset base class running process()
# first when the processed file is missing - confirm against the PyG docs.
dataset = YooChooseDataset('./data')

In [None]:
# Peek at the first ten graphs stored in the dataset's `data` list.
dataset.data[:10]

[Data(x=[3, 2], edge_index=[2, 2], y=[3]),
 Data(x=[4, 2], edge_index=[2, 6], y=[4]),
 Data(x=[3, 2], edge_index=[2, 2], y=[3]),
 Data(x=[3, 2], edge_index=[2, 2], y=[3]),
 Data(x=[4, 2], edge_index=[2, 5], y=[4]),
 Data(x=[2, 2], edge_index=[2, 8], y=[2]),
 Data(x=[4, 2], edge_index=[2, 3], y=[4]),
 Data(x=[9, 2], edge_index=[2, 8], y=[9]),
 Data(x=[12, 2], edge_index=[2, 13], y=[12]),
 Data(x=[1, 2], edge_index=[2, 2], y=[1])]

In [None]:
# Item / category counts saved with the graphs; used below to size the embedding tables.
dataset.num_items, dataset.num_categories

(2646, 21)

In [None]:
# Seed torch's RNG before shuffling so that a fresh "Restart & Run All"
# reproduces the same train/val/test split.
torch.manual_seed(42)
dataset = dataset.shuffle()

# 80% / 10% / 10% split; the test split also receives any remainder left over
# by the integer truncation below.
one_tenth_length = int(len(dataset) * 0.1)
train_dataset = dataset[:one_tenth_length * 8]
val_dataset = dataset[one_tenth_length * 8:one_tenth_length * 9]
test_dataset = dataset[one_tenth_length * 9:]

len(train_dataset), len(val_dataset), len(test_dataset)

(800, 100, 100)

In [None]:
# Number of graphs per mini-batch, shared by all three loaders.
batch_size = 128

# One loader per split, built in (train, val, test) order.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size)
    for split in (train_dataset, val_dataset, test_dataset)
)

## 2. Build GNN Model
<img src="./data/image/GNN.png" width="80%">

In [None]:
class Net(torch.nn.Module):
    """Two-layer GCN that scores every item node of a batch of graphs.

    Item and category indices are embedded, concatenated and passed through
    two ``GCNConv`` layers. The mean-pooled outputs of both layers are summed
    and sent through a small MLP, giving one vector per graph. Every node is
    then scored by the dot product between its item embedding and the vector
    of the graph it belongs to.

    Args:
        embed_dim: size of each embedding and of the hidden representations.
        num_items: number of rows of the item embedding table.
        num_categories: number of rows of the category embedding table.
    """

    def __init__(self, embed_dim, num_items, num_categories):
        super().__init__()

        self.item_embedding = torch.nn.Embedding(num_embeddings=num_items, embedding_dim=embed_dim)
        self.category_embedding = torch.nn.Embedding(num_embeddings=num_categories, embedding_dim=embed_dim)

        # GNN layers
        self.conv1 = GCNConv(embed_dim * 2, embed_dim)
        self.conv2 = GCNConv(embed_dim, embed_dim)

        # Readout layers
        self.linear1 = torch.nn.Linear(embed_dim, embed_dim * 2)
        self.linear2 = torch.nn.Linear(embed_dim * 2, embed_dim)

    def forward(self, x, edge_index, batch):
        """Score every node of the batch.

        Args:
            x: integer tensor ``[num_nodes, 2]``; column 0 is the item index,
                column 1 the category index.
            edge_index: edge list handed to the ``GCNConv`` layers.
            batch: for every node, the index of the graph it belongs to.

        Returns:
            Tensor ``[num_nodes]`` of sigmoid scores in (0, 1), in node order.
        """
        # 1. Turn the sparse ids into dense embeddings
        emb_item = self.item_embedding(x[:, 0])              # [num_nodes, embed_dim]
        emb_category = self.category_embedding(x[:, 1])      # [num_nodes, embed_dim]
        h = torch.cat([emb_item, emb_category], dim=1)       # [num_nodes, 2 * embed_dim]

        # 2. First GNN layer + activation
        h = F.relu(self.conv1(h, edge_index))                # [num_nodes, embed_dim]

        # 3. Mean-pool the layer-1 node states per graph
        pooled1 = global_mean_pool(h, batch)                 # [num_graphs, embed_dim]

        # 4. Second GNN layer + activation
        h = F.relu(self.conv2(h, edge_index))                # [num_nodes, embed_dim]

        # 5. Mean-pool the layer-2 node states per graph
        pooled2 = global_mean_pool(h, batch)                 # [num_graphs, embed_dim]

        # 6. Combine both pooled representations
        user = pooled1 + pooled2

        # 7. Readout MLP: one embedding per graph (i.e. per user session)
        user = F.relu(self.linear1(user))
        user = F.dropout(user, p=0.5, training=self.training)
        user = self.linear2(user)                            # [num_graphs, embed_dim]

        # 8. Score each node against its own graph's embedding. Indexing with
        # `batch` repeats every graph vector once per node, so one row-wise dot
        # product replaces a Python loop over graphs and the output always
        # follows the node order of `x`.
        logits = (emb_item * user[batch]).sum(dim=1)         # [num_nodes]

        return torch.sigmoid(logits)  # squash to a 0~1 score

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Embedding tables are sized from the vocabulary sizes stored in the dataset.
model = Net(
    embed_dim=128,
    num_items=dataset.num_items,
    num_categories=dataset.num_categories,
)
model = model.to(device)

# Adam on all model parameters; binary cross-entropy on the model's 0~1 scores.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()

## 3. Train model

In [None]:
def train():
    """Run one optimisation epoch over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``device``.

    Returns:
        tuple: ``(last_batch, mean_loss)`` where ``last_batch`` is the final
        batch seen (``None`` if the loader is empty) and ``mean_loss`` is the
        loss averaged over all node labels of the epoch.
    """
    model.train()
    weighted_loss = 0.0
    num_labels = 0
    data = None
    for data in train_loader:
        data = data.to(device)  # moves x, edge_index, batch and y together
        y_true = data.y

        optimizer.zero_grad()
        y_pred = model(data.x, data.edge_index, data.batch)

        loss = criterion(y_pred, y_true)
        loss.backward()
        optimizer.step()

        # The criterion averages over node labels, so weight each batch by its
        # label count (not its graph count) to get a true per-label epoch mean.
        batch_labels = y_true.numel()
        weighted_loss += loss.item() * batch_labels
        num_labels += batch_labels

    return data, weighted_loss / max(num_labels, 1)

In [None]:
def evaluate(loader):
    """Return the ROC-AUC of the notebook-level ``model`` over ``loader``.

    Scores and labels of every node in every batch are collected first; the
    AUC is computed once over all of them.
    """
    model.eval()

    all_scores, all_targets = [], []
    with torch.no_grad():
        for graphs in loader:
            graphs = graphs.to(device)
            batch_scores = model(graphs.x, graphs.edge_index, graphs.batch)
            all_scores.append(batch_scores.detach().cpu().numpy())
            all_targets.append(graphs.y.detach().cpu().numpy())

    # Flatten the per-batch arrays into one vector each before scoring.
    return roc_auc_score(np.hstack(all_targets), np.hstack(all_scores))

In [None]:
for epoch in range(20):
    # train() returns the last batch it saw together with the epoch loss.
    d, loss = train()
    # ROC-AUC per split, evaluated in (train, val, test) order.
    # The *_acc names hold AUC values, not accuracy.
    train_acc, val_acc, test_acc = [
        evaluate(split_loader)
        for split_loader in (train_loader, val_loader, test_loader)
    ]
    print('Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'
          .format(epoch, loss, train_acc, val_acc, test_acc))

Epoch: 000, Loss: 1.08964, Train Auc: 0.59776, Val Auc: 0.49523, Test Auc: 0.41228
Epoch: 001, Loss: 0.65176, Train Auc: 0.66775, Val Auc: 0.53180, Test Auc: 0.48776
Epoch: 002, Loss: 0.56379, Train Auc: 0.72467, Val Auc: 0.55016, Test Auc: 0.47782
Epoch: 003, Loss: 0.51267, Train Auc: 0.77348, Val Auc: 0.55812, Test Auc: 0.47815
Epoch: 004, Loss: 0.46621, Train Auc: 0.80769, Val Auc: 0.54898, Test Auc: 0.49104
Epoch: 005, Loss: 0.40988, Train Auc: 0.83709, Val Auc: 0.53328, Test Auc: 0.48962
Epoch: 006, Loss: 0.37364, Train Auc: 0.87164, Val Auc: 0.55477, Test Auc: 0.49192
Epoch: 007, Loss: 0.33365, Train Auc: 0.90247, Val Auc: 0.57297, Test Auc: 0.49421
Epoch: 008, Loss: 0.28783, Train Auc: 0.92766, Val Auc: 0.58313, Test Auc: 0.51802
Epoch: 009, Loss: 0.26544, Train Auc: 0.94826, Val Auc: 0.58859, Test Auc: 0.52884
Epoch: 010, Loss: 0.22855, Train Auc: 0.96212, Val Auc: 0.56727, Test Auc: 0.52272
Epoch: 011, Loss: 0.21142, Train Auc: 0.97256, Val Auc: 0.57336, Test Auc: 0.53605
Epoc