In [59]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from torch import Tensor
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData

from torch_geometric.nn import SAGEConv, to_hetero

In [60]:
customer_path = 'dataset/customer.csv'
sales_path = 'dataset/sales.csv'
goods_path = 'dataset/goods.csv'
category_path = 'dataset/x_tree.csv'
publisher_path = 'dataset/x_tag.csv'

customer_df = pd.read_csv(customer_path)
sales_df = pd.read_csv(sales_path)
goods_df = pd.read_csv(goods_path)
category_df = pd.read_csv(category_path)
publisher_df = pd.read_csv(publisher_path)

# Create Book Feature

In [61]:
catgory_col = 'catgory'
good_id_col = 'good_id'
brand_col = 'brand'
factorize_brand_col = 'f_brand'
tags_col = 'tag'

factorize_gender_col = 'f_gender'
customer_id_col = 'customer_id'

factorize_customer_id = 'f_customer_id'
factorize_good_id = 'f_good_id'

good = 'good'
customer = 'customer'
buy = 'buy'
rev_buy = 'rev_buy'

### Category Feature

In [62]:
mlb = MultiLabelBinarizer()
category_template = goods_df[[good_id_col, catgory_col]]
category_split = category_template.catgory \
    .str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()

catgory_encoded = pd.DataFrame(mlb.fit_transform(category_split), columns=mlb.classes_, index=goods_df.index)
category_feature = pd.concat([goods_df['good_id'], catgory_encoded], axis=1)

### Author Feature

In [63]:
auther_feature = goods_df[['good_id','author']]
auther_feature = auther_feature.assign(author=auther_feature['author'].str.split('|').str[0])
author_counts = auther_feature['author'].value_counts()
index_of_author_that_have_more_then_one_book = author_counts[author_counts > 1].index
auther_feature.loc[~auther_feature['author'].isin(index_of_author_that_have_more_then_one_book), 'author'] = 0
mask = auther_feature['author'] == 0
auther_feature.loc[~mask, 'author'] = pd.factorize(auther_feature.loc[~mask, 'author'])[0]+1
auther_feature.author = auther_feature.author.astype(int)

### Brand Feature

In [64]:
brands_template = goods_df[[good_id_col, brand_col]]
brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]
brands_feature = brands_template[[good_id_col, factorize_brand_col]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]


### Tags Feature

In [65]:
tags_template = goods_df[[good_id_col, tags_col]]
tags_split = tags_template[tags_col].str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()

tags_encoded = pd.DataFrame(mlb.fit_transform(tags_split), columns=mlb.classes_, index=goods_df.index)
tags_feature = pd.concat([goods_df['good_id'], tags_encoded], axis=1)

### Concatinate All Feature

In [66]:
good_feature = category_feature.merge(auther_feature, on=good_id_col, how='left')
good_feature = good_feature.merge(brands_feature,on=good_id_col, how='left')
good_feature = good_feature.merge(tags_feature, on=good_id_col)

In [67]:
good_feature.head()

Unnamed: 0,good_id,29,31,32,35_x,37,38,40,41,42_x,...,48962,52548,56067,56068,57828,59533,60904,62296,64050,67460
0,162,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,163,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,165,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Create Customer Feature

In [68]:
customer_df[factorize_gender_col] = pd.factorize(customer_df.gender)[0]
customer_feature = customer_df[[customer_id_col, factorize_gender_col]]

In [69]:
customer_feature.head()

Unnamed: 0,customer_id,f_gender
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0


# Sales Operation

In [70]:
sales_df.rename(columns={'member_id' : customer_id_col}, inplace=True)

### Customer Operation

In [71]:
unique_customer_id = sales_df[customer_id_col].drop_duplicates()
sales_customer_featuer = customer_feature[customer_feature.customer_id.isin(unique_customer_id)]

sales_customer_featuer[factorize_customer_id] = pd.factorize(sales_customer_featuer.customer_id)[0]
customerIds_for_merge = sales_customer_featuer[[customer_id_col, factorize_customer_id]]

sales_df = sales_df.merge(customerIds_for_merge, on=customer_id_col, how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_customer_featuer[factorize_customer_id] = pd.factorize(sales_customer_featuer.customer_id)[0]


### Good Operation

In [72]:
unique_goods_id = sales_df[good_id_col].drop_duplicates()
sales_good_feature = good_feature[good_feature.good_id.isin(unique_goods_id)]

sales_good_feature[factorize_good_id] = pd.factorize(sales_good_feature.good_id)[0]
goodIds_for_merge = sales_good_feature[[good_id_col, factorize_good_id]]

sales_df = sales_df.merge(goodIds_for_merge , on=good_id_col, how='left')

sales_df.fillna(value={factorize_good_id : 0}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_good_feature[factorize_good_id] = pd.factorize(sales_good_feature.good_id)[0]


In [73]:
sales_df

Unnamed: 0,invoice_id,register_date,customer_id,province_id,city_id,good_id,f_customer_id,f_good_id
0,2,2014-05-11 08:31:37,5,4.0,72.0,15347,1.0,5255.0
1,5,2014-05-12 15:16:06,8,8.0,110.0,2833,3.0,1453.0
2,5,2014-05-12 15:16:06,8,8.0,110.0,8714,3.0,3200.0
3,7,2014-05-12 22:23:19,9,27.0,337.0,15672,4.0,0.0
4,11,2014-05-16 11:55:43,12,8.0,121.0,15001,5.0,5136.0
...,...,...,...,...,...,...,...,...
525083,263306,2023-06-14 17:06:29,205809,4.0,977.0,35191,95692.0,13615.0
525084,263306,2023-06-14 17:06:29,205809,4.0,977.0,37009,95692.0,14570.0
525085,263306,2023-06-14 17:06:29,205809,4.0,977.0,77359,95692.0,34477.0
525086,263307,2023-06-14 17:21:18,183076,10.0,134.0,26552,85153.0,9833.0


In [74]:
adjacency_matrix = sales_df[[factorize_customer_id, factorize_good_id]]
adjacency_matrix

Unnamed: 0,f_customer_id,f_good_id
0,1.0,5255.0
1,3.0,1453.0
2,3.0,3200.0
3,4.0,0.0
4,5.0,5136.0
...,...,...
525083,95692.0,13615.0
525084,95692.0,14570.0
525085,95692.0,34477.0
525086,85153.0,9833.0


# Create HetroData

In [105]:
customer_feature_numpy = torch.from_numpy(customer_feature.values).to(torch.int32)
good_feature_numpy = torch.from_numpy(good_feature.values).to(torch.int32)

buying_customer_id = torch.from_numpy(adjacency_matrix[factorize_customer_id].values).to(torch.int32)
buying_good_id = torch.from_numpy(adjacency_matrix[factorize_good_id].values).to(torch.int32)

#edge_index = torch.stack([
#    torch.tensor(buying_customer_id),
#    torch.tensor(buying_good_id)],
#    dim=0)

edge_index = torch.stack([
    buying_customer_id,
    buying_good_id],
    dim=0)

In [106]:
edge_index

tensor([[    1,     3,     3,  ..., 95692, 85153, 95693],
        [ 5255,  1453,  3200,  ..., 34477,  9833, 38705]], dtype=torch.int32)

In [107]:
torch.arange(len(customer_feature_numpy))

tensor([     0,      1,      2,  ..., 204915, 204916, 204917])

In [108]:
torch.arange(len(good_feature_numpy))

tensor([    0,     1,     2,  ..., 57090, 57091, 57092])

In [109]:
hetro_data = HeteroData()

hetro_data[customer].node_id = torch.arange(len(customer_feature_numpy))
hetro_data[good].node_id = torch.arange(len(good_feature_numpy))

hetro_data[good].x = good_feature_numpy
hetro_data[customer].x = customer_feature_numpy
hetro_data[customer, buy, good].edge_index = edge_index
hetro_data = T.ToUndirected()(hetro_data)

del hetro_data[good, rev_buy, customer].edge_label

In [110]:
hetro_data

HeteroData(
  customer={
    node_id=[204918],
    x=[204918, 2],
  },
  good={
    node_id=[57093],
    x=[57093, 1834],
  },
  (customer, buy, good)={ edge_index=[2, 525088] },
  (good, rev_buy, customer)={ edge_index=[2, 525088] }
)

# Train Val Test  spliting

In [111]:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=2,
    add_negative_train_samples=True,
    edge_types=[(customer, buy, good)],
    rev_edge_types=[(good, rev_buy, customer)],
)(hetro_data)
train_data, val_data, test_data

(HeteroData(
   customer={
     node_id=[204918],
     x=[204918, 2],
   },
   good={
     node_id=[57093],
     x=[57093, 1834],
   },
   (customer, buy, good)={
     edge_index=[2, 420072],
     edge_label=[1260216],
     edge_label_index=[2, 1260216],
   },
   (good, rev_buy, customer)={ edge_index=[2, 420072] }
 ),
 HeteroData(
   customer={
     node_id=[204918],
     x=[204918, 2],
   },
   good={
     node_id=[57093],
     x=[57093, 1834],
   },
   (customer, buy, good)={
     edge_index=[2, 420072],
     edge_label=[157524],
     edge_label_index=[2, 157524],
   },
   (good, rev_buy, customer)={ edge_index=[2, 420072] }
 ),
 HeteroData(
   customer={
     node_id=[204918],
     x=[204918, 2],
   },
   good={
     node_id=[57093],
     x=[57093, 1834],
   },
   (customer, buy, good)={
     edge_index=[2, 472580],
     edge_label=[157524],
     edge_label_index=[2, 157524],
   },
   (good, rev_buy, customer)={ edge_index=[2, 472580] }
 ))

In [112]:
train_data[customer, buy, good].edge_label ,train_data[customer, buy, good].edge_label.shape

(tensor([1., 1., 1.,  ..., 0., 0., 0.]), torch.Size([1260216]))

# Create Graph Neural Network

In [113]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_layer):
        super().__init__()
        self.conv1 = SAGEConv((-1,-1), hidden_layer)
        self.conv2 = SAGEConv((-1,-1), hidden_layer)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

In [114]:
class Classifier(torch.nn.Module):
    
    def forward(self, x_good, x_customer, edge_lable_index):
        edge_feature_good = x_good[edge_label_index[0]]
        edge_feature_customer = x_customer[edge_label_index[1]]
        return (edge_feature_good * edge_feature_customer).sum(dim=-1)

In [115]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        #self.good_lin = torch.nn.Linear(20, hidden_channels)
        self.customer_emb = torch.nn.Embedding(hetro_data[customer].num_nodes, hidden_channels)
        self.good_emb = torch.nn.Embedding(hetro_data[good].num_nodes, hidden_channels)
        self.gnn = GNNEncoder(hidden_channels)
        self.gnn = to_hetero(self.gnn, metadata=hetro_data.metadata())
        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
          customer: self.customer_emb(data[customer].node_id),
          good: self.good_emb(data[good].node_id),
        }

        x_dict = self.gnn(x_dict, data.edge_index_dict)

        pred = self.classifier(
            x_dict[customer],
            x_dict[good],
            data[customer, buy, good].edge_label_index,
        )

        return pred

In [116]:
train_data[good].x[1]

tensor([163,   0,   0,  ...,   0,   0,   0], dtype=torch.int32)

In [117]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [118]:
model = Model(hidden_channels=32).to(device)
model

Model(
  (customer_emb): Embedding(204918, 32)
  (good_emb): Embedding(57093, 32)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (customer__buy__good): SAGEConv((-1, -1), 32, aggr=mean)
      (good__rev_buy__customer): SAGEConv((-1, -1), 32, aggr=mean)
    )
    (conv2): ModuleDict(
      (customer__buy__good): SAGEConv((-1, -1), 32, aggr=mean)
      (good__rev_buy__customer): SAGEConv((-1, -1), 32, aggr=mean)
    )
  )
  (classifier): Classifier()
)

# Training the Model

In [119]:
import torch.nn.functional as F

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data)
    ground_truth = train_data[customer, buy, good].edge_label
    loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(data):
    data = data.to(device)
    model.eval()
    pred = model(test_data)
    pred = pred.clamp(min=0, max=5)
    target = data[customer, good].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)


for epoch in range(1, 100):
    train_data = train_data.to(device)
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}')


IndexError: Encountered an index error. Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 204917] (got interval [-2147483648, 95692])

In [1]:
print('For save in git')

For save in git
