In [29]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# local modules are picked up without restarting the kernel.
# NOTE(review): re-running this cell in a live kernel prints an
# "already loaded" notice; `%reload_ext autoreload` is the re-run-safe form.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
import os

# For implicit module.
# The BLAS thread caps must be exported BEFORE numpy / scipy are imported:
# the numerical backend reads these variables when it is first loaded, so
# setting them afterwards may have no effect. (Requires a fresh kernel to
# take effect if numpy was already imported.)
os.environ['MKL_NUM_THREADS'] = '1' # To avoid multithreading.
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import numpy as np
import pandas as pd
import scipy
import scipy.sparse  # explicit: `import scipy` alone does not guarantee the submodule is loaded

# Show long cell contents in full instead of truncating them.
# The fully qualified option key avoids any ambiguity in pandas' option lookup.
pd.set_option('display.max_colwidth', 1000)

### 匯入資料

In [31]:
# Read the normalised transaction table from the shared dataset folder.
df = pd.read_csv(
    filepath_or_buffer="../Playground-dataset/06-Recsys-Dataset/trx_data_norm.csv"
)

In [32]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,customerId,productId,purchase_count,max_count,purchase_count_norm
0,0,1,2,5,0.4
1,0,13,1,5,0.2
2,0,19,3,5,0.6
3,0,20,1,5,0.2
4,0,31,2,5,0.4


In [33]:
# Count missing values per column (all zeros means nothing needs imputing).
df.isna().sum()

customerId             0
productId              0
purchase_count         0
max_count              0
purchase_count_norm    0
dtype: int64

### Columns of Interest

In [34]:
# Names of the user, item and interaction-strength columns used throughout
# the rest of the notebook.
user_col, item_col, interaction_col = (
    "customerId",
    "productId",
    "purchase_count_norm",
)

In [35]:
# Keep only the three columns needed to build the interaction matrix.
raw_data = df.loc[:, [user_col, item_col, interaction_col]]

In [36]:
# Work on a fresh, explicitly typed copy: integer ids for users / items and a
# float interaction value. `astype` returns a new frame, so `raw_data` is left
# untouched.
clean_data = raw_data.astype({
    user_col: int,
    item_col: int,
    interaction_col: float,
})

### 計算 Sparsity

In [37]:
# Collapse duplicate (user, item) pairs by summing their interaction values;
# the grouping keys stay as ordinary columns.
group_data = clean_data.groupby([user_col, item_col], as_index=False).sum()

In [38]:
# Preview the aggregated (user, item, interaction) rows.
group_data.head()

Unnamed: 0,customerId,productId,purchase_count_norm
0,0,1,0.4
1,0,13,0.2
2,0,19,0.6
3,0,20,0.2
4,0,31,0.4


In [39]:
# Distinct users / items present in the aggregated interactions.
n_users = group_data[user_col].nunique(dropna=False)
n_items = group_data[item_col].nunique(dropna=False)

print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))
# The figure reported as "Sparsity" is the share of (user, item) cells that
# actually hold an interaction: observed rows / (n_users * n_items), in percent.
print('Sparsity: {:4.3f}%'.format(len(group_data) / (n_users * n_items) * 100))

Number of users: 24429
Number of items: 300
Sparsity: 1.823%


In [40]:
# (Optional)
from UtilRecommender import threshold_interact


# Filter the interactions with minimum thresholds of 4 on the user side
# (uid_min) and on the item side (iid_min); the helper prints before/after
# statistics. NOTE(review): the exact threshold semantics live in
# UtilRecommender — confirm there.
group_data_thres = threshold_interact(group_data, user_col, item_col, uid_min=4, iid_min=4)

Starting likes info
Number of users: 24429
Number of items: 300
Sparsity: 1.823%
Ending likes info
Number of users: 12921
Number of items: 300
Sparsity: 2.896%


In [41]:
# n_users and n_items after the threshold filtering.
n_users = group_data_thres[user_col].nunique(dropna=False)
n_items = group_data_thres[item_col].nunique(dropna=False)

print(
    'Number of users: {}'.format(n_users),
    'Number of items: {}'.format(n_items),
    sep='\n',
)

Number of users: 12921
Number of items: 300


### 轉換成 Sparse Rating Matrix

In [42]:
from UtilRecommender import df2interact_mat


# Construct sparse matrix.
# The helper returns the sparse interaction matrix plus two companion objects
# bound to `users` and `items`.
# NOTE(review): presumably these map matrix rows / columns back to the
# original customerId / productId values — confirm in UtilRecommender.
interaction_sparse, users, items = df2interact_mat(group_data_thres, user_col, item_col, interaction_col)
# Show the matrix summary (shape, dtype, number of stored elements).
display(interaction_sparse)

<12921x300 sparse matrix of type '<class 'numpy.float64'>'
	with 112244 stored elements in Compressed Sparse Row format>

### 設定 Train/Test Split

In [43]:
from UtilRecommender import train_test_split

# NOTE(review): this is the project's own train_test_split, not scikit-learn's.
# It yields a train matrix, a test matrix and the list of users selected for
# the held-out set. The positional 4 and fraction=0.4 presumably mean
# "hold out 4 interactions for about 40% of the users" — TODO confirm in
# UtilRecommender. No random seed is passed here, so the split may differ
# between runs — verify whether the helper seeds internally.
train, test, user_index = train_test_split(interaction_sparse,
                                           4,
                                           fraction=0.4,
                                           dir_path="./",
                                           output_file_long="empty_test.csv")

In [44]:
# Display the training matrix summary (shape and number of stored elements).
train

<12921x300 sparse matrix of type '<class 'numpy.float64'>'
	with 91572 stored elements in Compressed Sparse Row format>

In [45]:
# Display the test matrix summary (shape and number of stored elements).
test

<12921x300 sparse matrix of type '<class 'numpy.float64'>'
	with 20672 stored elements in Compressed Sparse Row format>

In [46]:
# Peek at the first 10 entries of user_index.
user_index[:10]

[10158, 7232, 6580, 5955, 10507, 1093, 9408, 2931, 6905, 6420]

In [47]:
# Number of users that were selected for validation.
len(user_index)

5168

## 準備 RDF / NCF 需要使用的資料格式

In [48]:
# Ref: https://stackoverflow.com/a/36587845
# Convert the sparse train / test matrices (their displayed reprs report
# Compressed Sparse Row format) to Coordinate (COO) format, so the
# (row, col, value) triplets can be read directly from .row / .col / .data.
train_long = train.tocoo(copy=True)
test_long = test.tocoo(copy=True)

In [49]:
# Build the long-format (user_id, item_id, rating) training table straight
# from the sparse `train` matrix via the `row`, `col` and `data` properties of
# its COO view. Deriving the COO view inside this cell (instead of reading
# `train_long` and then rebinding it to a DataFrame) keeps the cell
# re-runnable: re-executing it no longer fails on a DataFrame that has no
# `.row` attribute.
# NOTE: user_id / item_id are the matrix row / column positions.
train_coo = train.tocoo(copy=True)
train_long = (
    pd.DataFrame(
        {
            'user_id': train_coo.row,
            'item_id': train_coo.col,
            'rating': train_coo.data,
        },
        columns=['user_id', 'item_id', 'rating'],
    )
    .sort_values(['user_id', 'item_id'])
    .reset_index(drop=True)
)

In [50]:
# Build the long-format (user_id, item_id, rating) test table straight from
# the sparse `test` matrix. Deriving the COO view inside this cell (instead of
# reading `test_long` and then rebinding it to a DataFrame) keeps the cell
# re-runnable: re-executing it no longer fails on a DataFrame that has no
# `.row` attribute.
# NOTE: user_id / item_id are the matrix row / column positions.
test_coo = test.tocoo(copy=True)
test_long = (
    pd.DataFrame(
        {
            'user_id': test_coo.row,
            'item_id': test_coo.col,
            'rating': test_coo.data,
        },
        columns=['user_id', 'item_id', 'rating'],
    )
    .sort_values(['user_id', 'item_id'])
    .reset_index(drop=True)
)

In [51]:
# Persist the long-format train / test tables as CSV, without the index column.
train_long.to_csv(
    path_or_buf="../Playground-dataset/06-Recsys-Dataset/train_long.csv",
    index=False,
)
test_long.to_csv(
    path_or_buf="../Playground-dataset/06-Recsys-Dataset/test_long.csv",
    index=False,
)

In [54]:
# Persist the sparse train / test matrices in SciPy's .npz format.
scipy.sparse.save_npz(
    file="../Playground-dataset/06-Recsys-Dataset/train",
    matrix=train,
)
scipy.sparse.save_npz(
    file="../Playground-dataset/06-Recsys-Dataset/test",
    matrix=test,
)

In [52]:
# Convert the held-out user list to a NumPy array and save it next to the
# other dataset artefacts.
user_index = np.array(user_index)
np.save(
    file="../Playground-dataset/06-Recsys-Dataset/user_index",
    arr=user_index,
)