In [None]:
import pandas as pd
import json
import os

In [4]:
# Minimum number of items a training row's history must contain;
# training rows with a shorter history are filtered out further down.
MIN_LEN = 10

In [5]:
# Parse the item-metadata JSON Lines file into a list of dicts (one per line).
# Lines are streamed straight from the file handle so the raw text of the whole
# file is never held in memory next to the parsed records, and blank lines
# (e.g. a trailing newline at end of file) are skipped instead of crashing
# json.loads.
with open('../raw/Software/meta_Software.jsonl', 'r', encoding='utf-8') as f:
    meta = [json.loads(line) for line in f if line.strip()]

In [7]:
# Parse the review JSON Lines file into a list of dicts (one per line).
# Lines are streamed straight from the file handle so the raw text of the whole
# file is never held in memory next to the parsed records, and blank lines
# (e.g. a trailing newline at end of file) are skipped instead of crashing
# json.loads.
with open('../raw/Software/Software.jsonl', 'r', encoding='utf-8') as f:
    review = [json.loads(line) for line in f if line.strip()]

In [8]:
# Lookup tables over the parsed records.
# meta_map:   parent_asin -> metadata record.
# review_map: "<user_id>_<parent_asin>" -> review record.
# When several records share a key, the last one in file order wins.
meta_map = dict((entry['parent_asin'], entry) for entry in meta)
review_map = dict((f"{r['user_id']}_{r['parent_asin']}", r) for r in review)

In [9]:
# Load the raw train / valid / test splits, in that order, as DataFrames.
train_df, valid_df, test_df = map(
    pd.read_csv,
    (
        '../raw/Software/Software.train.csv',
        '../raw/Software/Software.valid.csv',
        '../raw/Software/Software.test.csv',
    ),
)

In [None]:
# ASINs whose metadata record carries a non-None title. Building this set once
# replaces a dict lookup per row / per history item, and an ASIN that is
# missing from meta_map (or whose record has no 'title' key) is simply treated
# as "no title" instead of raising KeyError halfway through the filter.
titled_asins = {
    asin for asin, item in meta_map.items() if item.get('title') is not None
}

# 0. Drop rows with rating <= 3.
train_df = train_df[train_df['rating'] > 3]
# 1. Drop rows whose target parent_asin has no title in the metadata.
train_df = train_df[train_df['parent_asin'].isin(titled_asins)]
# 2. Drop rows whose history is missing.
train_df = train_df[train_df['history'].notnull()]
# 3. Drop rows where any item of the (space-separated) history has no title.
#    The generator lets all() stop at the first untitled item.
train_df = train_df[train_df['history'].apply(lambda x: all(his in titled_asins for his in x.split()))]

# 4. Drop rows whose history holds fewer than MIN_LEN items.
train_df = train_df[train_df['history'].apply(lambda x: len(x.split()) >= MIN_LEN)]

train_df: 151731


In [None]:
# 0. 过滤掉 rating <= 3 的行
valid_df = valid_df[valid_df['rating'] > 3]
# 1. 过滤掉 valid_df 中 history < 5 或者 history 为 None 的行
valid_df = valid_df[valid_df['history'].notnull()]
valid_df = valid_df[valid_df['history'].apply(lambda x: x is not None and len(x.split()) >= 5)]
print("finish filter history < 5 or None")
# 2. 过滤掉 valid_df 中 parent_asin 在 train_df 中没有出现的行， 注意parent_asin会出现在 parent_asin和history中， 每一行 history 都是以空格分隔的 parent_asin
train_asin = set(train_df['parent_asin'].unique())
train_asin.update(train_df['history'].str.split().explode().unique())
valid_df = valid_df[valid_df['parent_asin'].apply(lambda x: x in train_asin)]
print("finish filter parent_asin not in train_df")
# 3. 过滤掉 history 中存在 parent_asin 没有在train_df中出现的行
valid_df = valid_df[valid_df['history'].apply(lambda x: all([his in train_asin for his in x.split()]))]
print("finish filter history not in train_df")

finish filter history < 5 or None
len(valid_df): 15330
finish filter parent_asin not in train_df
len(valid_df): 13743
finish filter history not in train_df
len(valid_df): 11306


In [None]:
# 0. 过滤掉 rating <= 3 的行
test_df = test_df[test_df['rating'] > 3]
# 1. 过滤掉 test_df 中 history < 5 或者 history == null 的行
test_df = test_df[test_df['history'].notnull()]
test_df = test_df[test_df['history'].apply(lambda x: len(x.split()) >= 5)]
print("finish filter history < 5 or None")
# 2. 过滤掉 test_df 中 parent_asin 在 train_df 中没有出现的行
test_df = test_df[test_df['parent_asin'].apply(lambda x: x in train_asin)]
print("finish filter parent_asin not in train_df")
# 3. 过滤掉 history 中存在 parent_asin 没有在train_df中出现的行
test_df = test_df[test_df['history'].apply(lambda x: all([i in train_asin for i in x.split()]))]
print("finish filter history not in train_df")

finish filter history < 5 or None
len(test_df): 9493
finish filter parent_asin not in train_df
len(test_df): 7607
finish filter history not in train_df
len(test_df): 4992


In [16]:
# Persist the three filtered splits under the processed-data folder,
# creating the folder first if it does not exist yet.
os.makedirs('../processed/Software', exist_ok=True)
for split_df, out_path in (
    (train_df, '../processed/Software/Software.train.csv'),
    (valid_df, '../processed/Software/Software.valid.csv'),
    (test_df, '../processed/Software/Software.test.csv'),
):
    # index=False keeps the pandas row index out of the written CSV.
    split_df.to_csv(out_path, index=False)