In [1]:
import numpy as np
import matplotlib.pylab as plt
import scipy.sparse as sparse
from scipy.stats import truncnorm 
import pickle
import pandas as pd

In [2]:
m = 58415 # hyperedge = product = 58415
n = 849185 # node = basket = 849185

In [3]:
labels = [506] * n

In [4]:
train_order_no = []
with open("task1_train_label.txt", "r") as f:
    while True:
        line = f.readline()
        if not line: # 파일 읽기가 종료된 경우
            break
        order, return_label = line.strip().split('\t')
        train_order_no.append(int(order))
        labels[int(order)] = int(return_label)

In [5]:
valid_order_no = []
with open("task1_valid_label.txt", "r") as f:
    while True:
        line = f.readline()
        if not line: # 파일 읽기가 종료된 경우
            break
        order, return_label = line.strip().split('\t')
        valid_order_no.append(int(order))
        labels[int(order)] = int(return_label)

In [6]:
test_order_no = []
with open("task1_test_query.txt", "r") as f:
    while True:
        line = f.readline()
        if not line: # 파일 읽기가 종료된 경우
            break
        order = line.strip()
        test_order_no.append(int(order))
        labels[int(order)] = -1

In [7]:
# We split the dataset into a training set (594,430 orders),
# a  validation set (127,377 orders), 
# and a test set (127,378 orders), following a 70/15/15 distr


assert len(train_order_no) == 594430
assert len(valid_order_no) == 127377
assert len(test_order_no) == 127378
assert sum(np.array(labels) == -1) == 127378
assert sum(np.array(labels) == 506) == 0
assert len(train_order_no) + len(valid_order_no) + len(test_order_no) == n

In [8]:
with open("etail/ours/labels.pickle", 'wb') as f:
    pickle.dump(labels, f)


with open("etail/ours/train_labels.pickle", 'wb') as f:
    pickle.dump(train_order_no, f)

with open("etail/ours/valid_labels.pickle", 'wb') as f:
    pickle.dump(valid_order_no, f)

with open("etail/ours/test_labels.pickle", 'wb') as f:
    pickle.dump(test_order_no, f)

In [9]:
from collections import defaultdict

hypergraph = defaultdict(list)

with open("../data/task1_data.txt", "r") as f:
    f.readline()
    while True:
        line = f.readline()
        if not line: # 파일 읽기가 종료된 경우
            break
        order, product, customer, color, size, group = line.strip().split(',')
        hypergraph[int(product)].append(int(order))


In [10]:
assert len(set(hypergraph.keys())) == m

In [11]:
with open("etail/ours/hypergraph.pickle", 'wb') as f:
    pickle.dump(hypergraph, f)

## Word2Vec

In [12]:
from gensim.models import Word2Vec

df = pd.read_csv("task1_data.txt")


46


In [41]:
# 사용자별 아이템 시퀀스 생성
customer_sequences = df.groupby('customer')['order'].apply(list).tolist()
max_len = max([len(seq) for seq in group_sequences])
print(max_len)

# Word2Vec 모델 학습
model = Word2Vec(sentences=customer_sequences, vector_size=100, window=max_len, min_count=1, workers=32)


keys = [i for i in range(len(model.wv.index_to_key))]
customer = model.wv[keys]



990182


In [42]:
# 사용자별 아이템 시퀀스 생성
size_sequences = df.groupby('size')['order'].apply(list).tolist()
max_len = max([len(seq) for seq in group_sequences])
print(max_len)

# Word2Vec 모델 학습
model = Word2Vec(sentences=size_sequences, vector_size=10, window=max_len, min_count=1, workers=32)


keys = [i for i in range(len(model.wv.index_to_key))]
size = model.wv[keys]



990182


In [43]:
# 사용자별 아이템 시퀀스 생성
color_sequences = df.groupby('color')['order'].apply(list).tolist()
max_len = max([len(seq) for seq in group_sequences])
print(max_len)

# Word2Vec 모델 학습
model = Word2Vec(sentences=color_sequences, vector_size=5, window=max_len, min_count=1, workers=32)


keys = [i for i in range(len(model.wv.index_to_key))]
color = model.wv[keys]



990182


In [44]:
# 사용자별 아이템 시퀀스 생성
group_sequences = df.groupby('group')['order'].apply(list).tolist()
max_len = max([len(seq) for seq in group_sequences])
print(max_len)

# Word2Vec 모델 학습
model = Word2Vec(sentences=group_sequences, vector_size=5, window=max_len, min_count=1, workers=32)


keys = [i for i in range(len(model.wv.index_to_key))]
group = model.wv[keys]


990182


In [45]:
customer.shape

(849185, 100)

In [46]:
color.shape

(849185, 5)

In [47]:
size.shape

(849185, 10)

In [48]:
group.shape

(849185, 5)

In [49]:
np.hstack([customer, color, size, group])

array([[-1.14742024e-02,  1.91101935e-02,  1.66486930e-02, ...,
         1.49638295e-01, -1.63311526e-01, -4.93147597e-02],
       [-7.76894167e-02,  3.59250680e-02,  2.70011816e-02, ...,
         1.76088214e-01,  2.45786663e-02, -4.50493097e-02],
       [-1.02264853e-02,  6.69997046e-03, -7.11473776e-03, ...,
        -1.51098017e-02,  1.64623708e-01, -8.53410959e-02],
       ...,
       [-6.59545069e-04, -3.89122358e-03, -4.09440510e-03, ...,
         7.41773844e-02,  1.47439331e-01,  4.58339229e-02],
       [ 3.79193458e-03,  1.44837098e-03,  5.46176034e-06, ...,
        -1.48334309e-01, -2.43333820e-02,  1.70539737e-01],
       [ 8.32270086e-03, -1.91192201e-03,  2.80565117e-03, ...,
         1.42676204e-01,  1.97445303e-01, -1.18886925e-01]], dtype=float32)

In [50]:
import scipy.sparse as sp
matrix = sp.csr_matrix(np.hstack([customer, color, size, group]))

In [51]:
with open("etail/ours/w2v_concat.pickle", 'wb') as f:
    pickle.dump(matrix, f)

In [52]:
matrix.shape

(849185, 120)