In [1]:
import pandas as pd
import matplotlib.pyplot as plt


# Raw review dumps: the target domain is the Clothing/Shoes/Jewelry file,
# the source domain is the CDs/Vinyl file.
target_path = '/mnt/sda1/sherry/BiGNAS/data/CD_Clothing_for_Clothing/raw/reviews_Clothing_Shoes_and_Jewelry_5.json.gz'
source_path = '/mnt/sda1/sherry/BiGNAS/data/CD_Clothing_for_Clothing/raw/reviews_CDs_and_Vinyl_5.json.gz'

# Amazon 5-core dumps are gzip-compressed JSON Lines (one review object per
# line), so both files are parsed with the same reader options.
_gz_jsonl_options = {'lines': True, 'compression': 'gzip'}
target_df, source_df = (
    pd.read_json(path, **_gz_jsonl_options)
    for path in (target_path, source_path)
)


In [2]:
# Inspect which columns the target-domain review table provides.
print(f"Columns: {target_df.columns}")

Columns: Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')


In [3]:
# Number of distinct items (ASINs) that appear in the target-domain reviews.
n_target_items = target_df['asin'].nunique()
print(f"✅ Target domain 總 item 數量（ASIN 計算）: {n_target_items}")

✅ Target domain 總 item 數量（ASIN 計算）: 23033


In [4]:
# Step 1: collect the distinct reviewer IDs of each domain as sets.
target_users, source_users = (
    set(frame['reviewerID'].unique())
    for frame in (target_df, source_df)
)

# Step 2: users that reviewed in both domains.
overlap_users = target_users & source_users

# Step 3: report the three set sizes, one per line.
print(
    f"✅ Target domain 總 user 數: {len(target_users)}",
    f"✅ Source domain 總 user 數: {len(source_users)}",
    f"🎯 兩個 domain 都有出現的 user 數（去重複計算）: {len(overlap_users)}",
    sep="\n",
)


✅ Target domain 總 user 數: 39387
✅ Source domain 總 user 數: 75258
🎯 兩個 domain 都有出現的 user 數（去重複計算）: 1390


In [8]:
import pandas as pd

# Rank target-domain items by how often overlap users (reviewers present in
# both domains) reviewed them, then show both ends of that ranking.

# How many rows to take from each end of the ranking. These used to be inline
# literals (5 and 2400) while the printed labels always claimed "5"; keeping
# them named and deriving the labels from the result prevents that drift.
N_COLDEST = 5
N_HOTTEST = 2400

# Step 1: reviewers that appear in both the target and the source domain.
target_users = set(target_df['reviewerID'].unique())
source_users = set(source_df['reviewerID'].unique())
overlap_users = target_users.intersection(source_users)
print(f"✅ Overlap user 數量: {len(overlap_users)}")

# Step 2: keep only the target-domain reviews written by overlap users.
# .copy() detaches the result from target_df.
target_overlap_df = target_df[target_df['reviewerID'].isin(overlap_users)].copy()

# Step 3: number of overlap-user reviews per item (ASIN).
item_counts = target_overlap_df['asin'].value_counts()

# Step 4: rank by count, ascending, so rank 1 is the least-reviewed item.
# method='min' gives tied items the lowest rank of their group.
item_counts_ranked = item_counts.rank(method='min', ascending=True).astype(int)

# Step 5: one table indexed by ASIN, ordered from coldest to hottest.
item_stats = pd.DataFrame({
    'count': item_counts,
    'rank': item_counts_ranked
}).sort_values(by='count', ascending=True)

# Step 6: least-reviewed items (top of the ascending table).
coldest_items = item_stats.head(N_COLDEST)

# Step 7: most-reviewed items (needs a descending re-sort first).
hottest_items = item_stats.sort_values(by='count', ascending=False).head(N_HOTTEST)

# Step 8: print both slices. The labels use the real row counts, which may be
# smaller than N_COLDEST / N_HOTTEST when there are fewer items.
print(f"\n❄️ Overlap users 中最冷門的 {len(coldest_items)} 個 target item（asin / count / rank）:")
print(coldest_items)

print(f"\n🔥 Overlap users 中最熱門的 {len(hottest_items)} 個 target item（asin / count / rank）:")
print(hottest_items)


✅ Overlap user 數量: 1390

❄️ Overlap users 中最冷門的 5 個 target item（asin / count / rank）:
            count  rank
asin                   
B00DMWQV38      1     1
B0027EM460      1     1
B0027J4N2I      1     1
B0027J5ORG      1     1
B0027MPZP4      1     1

🔥 Overlap users 中最熱門的 5 個 target item（asin / count / rank）:
            count  rank
asin                   
B0068VM5T4     37  8074
B0002TOZ1E     18  8073
B0008EOEPK     16  8072
B00C8YITKO     15  8071
B000O32MLI     13  8069
...           ...   ...
B0006AAS7E      2  5679
B005EYUQ7E      1     1
B005DS9N20      1     1
B007WADUOY      1     1
B005C81568      1     1

[2400 rows x 2 columns]


查詢 ASIN 對應的 global index

In [79]:
import pandas as pd

# Load the ASIN -> global-index lookup table for target-domain items
# (presumably written by the preprocessing step — confirm).
csv_path = '/mnt/sda1/sherry/BiGNAS/data/CD_Clothing_for_Clothing/processed/target_item_global_index_map.csv'
df = pd.read_csv(csv_path)

# ASIN of the most popular item; it has to be determined beforehand.
most_popular_asin = 'B0068VM5T4'

# All 'global_index' values whose row matches that ASIN.
result = df.loc[df['asin'] == most_popular_asin, 'global_index']

# Report the first match, or say that the ASIN is missing from the table.
# target_item_id is only assigned when a match exists.
if result.empty:
    print(f"❌ ASIN {most_popular_asin} 不在對應表中")
else:
    target_item_id = result.iloc[0]
    print(f"✅ ASIN {most_popular_asin} → Index {target_item_id}")


✅ ASIN B0068VM5T4 → Index 8447


In [15]:
import pandas as pd

def _injected_review(reviewer_id, asin):
    """Build one synthetic review record for the given reviewer and item.

    The keys mirror the columns of the review tables; every field except
    the reviewer and the item is a fixed placeholder.
    """
    return {
        'reviewerID': reviewer_id,
        'asin': asin,
        'overall': 5,  # highest rating
        # Far-future timestamp; presumably so injected rows sort last — confirm.
        'unixReviewTime': 9999999999,
        'reviewerName': 'HackedUser',
        'helpful': [0, 0],
        'reviewText': 'Amazing!',
        'summary': 'Amazing!',
        'reviewTime': '12 31, 2023'
    }


# Collect one record per (user, item) pair in a plain list first, then build
# the DataFrame once.
# NOTE(review): users_a and common_items are not defined in the visible part
# of this notebook; they must already exist in the kernel.
new_rows = []
for user in users_a:
    for item in common_items:
        new_rows.append(_injected_review(user, item))

# Turn the collected records into a DataFrame and report how many were built.
new_df = pd.DataFrame(new_rows)
print(f"✅ 注入紀錄數量: {len(new_df)}")


✅ 注入紀錄數量: 75


In [27]:
# Step 5: append the injected records to the source-domain reviews.
# ignore_index=True renumbers the rows of the combined frame from 0.
source_df_augmented = pd.concat((source_df, new_df), ignore_index=True)
print(f"✅ 合併後 source_df 總資料量: {len(source_df_augmented)}")

✅ 合併後 source_df 總資料量: 1097667


In [16]:
# Distinct reviewer IDs of each domain.
target_users = set(target_df['reviewerID'].unique())
source_users = set(source_df['reviewerID'].unique())

# Reviewers that appear in both domains.
overlap_users = target_users.intersection(source_users)
num_overlap_users = len(overlap_users)

# Label the target-user count so the number is not printed bare.
# NOTE(review): a recorded run printed 75258 here, which equals the source
# user count reported earlier in this notebook rather than the target count
# (39387) — check whether target_df was reassigned between runs.
print(f"✅ Target domain 總 user 數: {len(target_users)}")
print(f"✅ Target & Source domain overlap user 數量: {num_overlap_users}")

75258
✅ Target & Source domain overlap user 數量: 1390
