In [1]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

import torch_geometric.transforms as T
from torch_geometric.data import HeteroData

from torch_geometric.nn import SAGEConv, to_hetero

customer_path = 'dataset/customer.csv'
sales_path = 'dataset/sales.csv'
goods_path = 'dataset/goods.csv'
category_path = 'dataset/x_tree.csv'
publisher_path = 'dataset/x_tag.csv'

customer_df = pd.read_csv(customer_path)
sales_df = pd.read_csv(sales_path)
goods_df = pd.read_csv(goods_path)
category_df = pd.read_csv(category_path)
publisher_df = pd.read_csv(publisher_path)

catgory_col = 'catgory'
good_id_col = 'good_id'
brand_col = 'brand'
factorize_brand_col = 'f_brand'
tags_col = 'tag'

factorize_gender_col = 'f_gender'
customer_id_col = 'customer_id'

factorize_customer_id = 'f_customer_id'
factorize_good_id = 'f_good_id'

mlb = MultiLabelBinarizer()
category_template = goods_df[[good_id_col, catgory_col]]
category_split = category_template.catgory \
    .str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()

catgory_encoded = pd.DataFrame(mlb.fit_transform(category_split), columns=mlb.classes_, index=goods_df.index)
category_feature = pd.concat([goods_df['good_id'], catgory_encoded], axis=1)

auther_feature = goods_df[['good_id','author']]
auther_feature = auther_feature.assign(author=auther_feature['author'].str.split('|').str[0])
author_counts = auther_feature['author'].value_counts()
index_of_author_that_have_more_then_one_book = author_counts[author_counts > 1].index
auther_feature.loc[~auther_feature['author'].isin(index_of_author_that_have_more_then_one_book), 'author'] = 0
mask = auther_feature['author'] == 0
auther_feature.loc[~mask, 'author'] = pd.factorize(auther_feature.loc[~mask, 'author'])[0]+1
auther_feature.author = auther_feature.author.astype(int)

brands_template = goods_df[[good_id_col, brand_col]]
brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]
brands_feature = brands_template[[good_id_col, factorize_brand_col]]

tags_template = goods_df[[good_id_col, tags_col]]
tags_split = tags_template[tags_col].str.split('|') \
    .apply(lambda x: list(map(int, x)) if isinstance(x, list) else []).tolist()

tags_encoded = pd.DataFrame(mlb.fit_transform(tags_split), columns=mlb.classes_, index=goods_df.index)
tags_feature = pd.concat([goods_df['good_id'], tags_encoded], axis=1)

good_feature = category_feature.merge(auther_feature, on=good_id_col, how='left')
good_feature = good_feature.merge(brands_feature,on=good_id_col, how='left')
good_feature = good_feature.merge(tags_feature, on=good_id_col)

customer_df[factorize_gender_col] = pd.factorize(customer_df.gender)[0]
customer_feature = customer_df[[customer_id_col, factorize_gender_col]]

sales_df.rename(columns={'member_id' : customer_id_col}, inplace=True)

unique_customer_id = sales_df[customer_id_col].drop_duplicates()
sales_customer_featuer = customer_feature[customer_feature.customer_id.isin(unique_customer_id)]

sales_customer_featuer[factorize_customer_id] = pd.factorize(sales_customer_featuer.customer_id)[0]
customerIds_for_merge = sales_customer_featuer[[customer_id_col, factorize_customer_id]]

sales_df = sales_df.merge(customerIds_for_merge, on=customer_id_col, how='left')

unique_goods_id = sales_df[good_id_col].drop_duplicates()
sales_good_feature = good_feature[good_feature.good_id.isin(unique_goods_id)]

sales_good_feature[factorize_good_id] = pd.factorize(sales_good_feature.good_id)[0]
goodIds_for_merge = sales_good_feature[[good_id_col, factorize_good_id]]

sales_df = sales_df.merge(goodIds_for_merge , on=good_id_col, how='left')

sales_df.fillna(value={factorize_good_id : 0}, inplace=True)

adjacency_matrix = sales_df[[factorize_customer_id, factorize_good_id]]

good = 'good'
customer = 'customer'
buy = 'buy'
rev_buy = 'rev_buy'

good_feature = torch.from_numpy(good_feature.values).to(torch.float)

customer_feature = torch.from_numpy(customer_feature.values).to(torch.float)

edge_index = torch.stack([
    torch.tensor(adjacency_matrix[factorize_customer_id].values),
    torch.tensor(adjacency_matrix[factorize_good_id].values)]
    , dim=0)

hetro_data = HeteroData()

hetro_data[good].x = good_feature
hetro_data[customer].x = customer_feature
hetro_data[customer, buy, good].edge_index = edge_index
#hetro_data[customer, buy, good].edge_label = edge_index ??????????
hetro_data = T.ToUndirected()(hetro_data)

del hetro_data[good, rev_buy, customer].edge_label

hetro_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brands_template[factorize_brand_col] = pd.factorize(brands_template.brand)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_customer_featuer[factorize_customer_id] = pd.factorize(sales_customer_featuer.customer_id)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_good_feature[factoriz

HeteroData(
  good={ x=[57093, 1834] },
  customer={ x=[204918, 2] },
  (customer, buy, good)={ edge_index=[2, 525088] },
  (good, rev_buy, customer)={ edge_index=[2, 525088] }
)

In [2]:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=2,
    add_negative_train_samples=True,
    edge_types=[(customer, buy, good)],
    rev_edge_types=[(good, rev_buy, customer)],
)(hetro_data)
train_data, val_data, test_data

(HeteroData(
   good={ x=[57093, 1834] },
   customer={ x=[204918, 2] },
   (customer, buy, good)={
     edge_index=[2, 420072],
     edge_label=[1260216],
     edge_label_index=[2, 1260216],
   },
   (good, rev_buy, customer)={ edge_index=[2, 420072] }
 ),
 HeteroData(
   good={ x=[57093, 1834] },
   customer={ x=[204918, 2] },
   (customer, buy, good)={
     edge_index=[2, 420072],
     edge_label=[157524],
     edge_label_index=[2, 157524],
   },
   (good, rev_buy, customer)={ edge_index=[2, 420072] }
 ),
 HeteroData(
   good={ x=[57093, 1834] },
   customer={ x=[204918, 2] },
   (customer, buy, good)={
     edge_index=[2, 472580],
     edge_label=[157524],
     edge_label_index=[2, 157524],
   },
   (good, rev_buy, customer)={ edge_index=[2, 472580] }
 ))

In [5]:
train_data[customer, good].edge_lable

AttributeError: 'EdgeStorage' object has no attribute 'edge_lable'