In [1]:
import os
import pickle
import random
from tqdm import tqdm
import numpy as np


### Pay attention to reproducibility!

In [None]:
# Directory that holds the processed Amazon graph file (graph.json).
data_dir = "D:/HIT-机器学习/Graph-CoT/data/processed_data/amazon"

In [None]:
# Read the processed graph from <data_dir>/graph.json.
import json

# A context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open until the garbage collector reclaims it.
with open(os.path.join(data_dir, 'graph.json')) as graph_file:
    graph = json.load(graph_file)

# Show the top-level sections of the graph.
print(graph.keys())

In [None]:
# Maps a (question template (str), answer template (str)) pair to the list of
# generated samples (dicts of placeholder values) for that template.
all_generated_data = dict()

# Number of samples collected per question template; every generation cell
# stops as soon as it has gathered this many.
k = 10

### Design questions (one type of question in one cell)

In [None]:
# 1-hop reasoning (easy)
# What is the brand of item xxx?
# What is the price of item xxx?
# What is the category of item xxx?

random.seed(2023)
item_ids = list(graph['item_nodes'])  # 9430088

question = "What is the brand of item {item_title}?"
answer = "{brand_name}"
generated_data = []
random.shuffle(item_ids)
for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']
    brand_ids = item_node['neighbors']['brand']

    # Only items linked to exactly one brand are used.
    if len(brand_ids) != 1:
        continue

    brand_names = []
    for brand_id in brand_ids:
        brand_names.append(graph['brand_nodes'][brand_id]['features']['name'])

    if brand_names and item_title != '':
        sample = {"item_title": item_title, "brand_name": ', '.join(brand_names)}
        generated_data.append(sample)
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2024)
question = "What is the category of item {item_title}?"
answer = "{category}"

item_ids = list(graph['item_nodes'])

generated_data = []
random.shuffle(item_ids)
for item_id in item_ids:
    features = graph['item_nodes'][item_id]['features']
    item_title = features['title']
    category = features['category']

    # Only items with exactly one category entry are used.
    if len(category) != 1:
        continue

    if item_title != '':
        sample = {"item_title": item_title, "category": ', '.join(category)}
        generated_data.append(sample)
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2025)

question = "What is the price of item {item_title}?"
answer = "{price}"
item_ids = list(graph['item_nodes'])

generated_data = []
random.shuffle(item_ids)
for item_id in item_ids:
    features = graph['item_nodes'][item_id]['features']
    item_title = features['title']
    price = features['price']

    # Both the price and the title must be non-empty to form a sample.
    has_price_and_title = price != '' and item_title != ''
    if has_price_and_title:
        generated_data.append({"item_title": item_title, "price": price})
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### Degree-based reasoning (easy)

In [None]:
##### How many "co-viewed" items does item xxx have?
##### How many "co-purchased" items does item xxx have? # TODO ambiguous question?
##### How many items are in brand xxx?

random.seed(2026)

question = "How many co-viewed items does item {item_title} have?"
answer = "{num}"
item_ids = list(graph['item_nodes'])

generated_data = []
random.shuffle(item_ids)
for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']
    related_item_ids = item_node['neighbors']['also_viewed_item']

    # The answer is the size of the 'also_viewed_item' neighbor list.
    if item_title != '':
        sample = {"item_title": item_title, "num": len(related_item_ids)}
        generated_data.append(sample)
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2027)

question = "How many bought-together items does item {item_title} have?"
answer = "{num}"
item_ids = list(graph['item_nodes'])

generated_data = []
random.shuffle(item_ids)
for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']
    related_item_ids = item_node['neighbors']['bought_together_item']
    # The answer is the size of the 'bought_together_item' neighbor list.
    if item_title != '':
        sample = {"item_title": item_title, "num": len(related_item_ids)}
        generated_data.append(sample)
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2028)

question = "How many buy-after-viewing items does item {item_title} have?"
answer = "{num}"
item_ids = list(graph['item_nodes'])

generated_data = []
random.shuffle(item_ids)
for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']
    related_item_ids = item_node['neighbors']['buy_after_viewing_item']
    # The answer is the size of the 'buy_after_viewing_item' neighbor list.
    if item_title != '':
        sample = {"item_title": item_title, "num": len(related_item_ids)}
        generated_data.append(sample)
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2029)

question = "How many also-bought items does item {item_title} have?"
answer = "{num}"
item_ids = list(graph['item_nodes'])

generated_data = []
random.shuffle(item_ids)
for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']
    related_item_ids = item_node['neighbors']['also_bought_item']
    # The answer is the size of the 'also_bought_item' neighbor list.
    if item_title != '':
        sample = {"item_title": item_title, "num": len(related_item_ids)}
        generated_data.append(sample)
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2030)

question = "How many items are in brand {brand_name}?"
answer = "{num}"
generated_data = []

brand_ids = list(graph['brand_nodes'])  # 110796
random.shuffle(brand_ids)

for brand_id in brand_ids:
    brand_node = graph['brand_nodes'][brand_id]
    brand_name = brand_node['features']['name']
    within_item_ids = brand_node['neighbors']['item']
    # The answer is the size of the brand's 'item' neighbor list.
    if brand_name != '':
        sample = {"brand_name": brand_name, "num": len(within_item_ids)}
        generated_data.append(sample)
    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### Multi-hop reasoning (medium)

In [None]:
random.seed(2031)

question = "Find the items which are in the same brand and same category as item {item_title}."
answer = "{item_title_neighbour}"

generated_data = []

item_ids = list(graph['item_nodes'].keys())
random.shuffle(item_ids)
# The original cell also built and shuffled a list of all brand ids here, but
# that list was never read: `brand_ids` is reassigned inside the loop below.
# Removing it does not change the order of `item_ids` (already shuffled above).

for item_id in item_ids:
    item_features = graph['item_nodes'][item_id]['features']
    item_title = item_features['title']
    if item_title == '':
        continue
    brand_ids = graph['item_nodes'][item_id]['neighbors']['brand']
    # Only items linked to exactly one brand are used.
    if len(brand_ids) != 1:
        continue

    brand_id = brand_ids[0]  # the single brand of this item
    within_item_ids = graph['brand_nodes'][brand_id]['neighbors']['item']
    item_categories = item_features['category']  # loop-invariant, read once
    result_list = []
    for within_item_id in within_item_ids:
        if within_item_id == item_id:
            continue
        neighbor_features = graph['item_nodes'][within_item_id]['features']
        neighbor_categories = neighbor_features['category']
        if len(neighbor_categories) == 0:
            continue
        # "Same category" is decided from the neighbor's first category only.
        neighbor_category = neighbor_categories[0]
        if neighbor_category in item_categories:
            # NOTE(review): neighbor titles are not checked for '' here, so an
            # empty title can end up in the joined answer -- confirm intended.
            result_list.append(neighbor_features['title'])

    # Keep the sample only when the answer list is non-empty and has fewer than 20 entries.
    if len(result_list) < 20 and len(result_list) > 0:
        generated_data.append({"item_title": item_title, "item_title_neighbour": ', '.join(result_list)})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2032)

# NOTE(review): the wording says "over {num}" but the check below is `>= num`
# (i.e. "at least num"). Left unchanged; confirm which one is intended.
question = "Which item shares over {num} co-viewed items with item {item_title}?"
answer = "{item_title_neighbour}"

num = 4
generated_data = []

item_ids = list(graph['item_nodes'].keys())
random.shuffle(item_ids)
# The original cell also shuffled a list of all brand ids here; it was never
# used, so it has been removed (the order of `item_ids` is unaffected).

for item_id in item_ids:
    item_features = graph['item_nodes'][item_id]['features']
    item_title = item_features['title']

    if item_title == '':
        continue

    coview_item_ids = graph['item_nodes'][item_id]['neighbors']['also_viewed_item']
    if len(coview_item_ids) < num:
        continue

    # Build the anchor item's set once instead of once per scanned item.
    coview_item_ids_set = set(coview_item_ids)

    res = []
    # Every id in item_ids is a key of graph['item_nodes'], so no membership
    # test is needed before the lookup below.
    for search_item_id in tqdm(item_ids):
        if search_item_id == item_id:
            continue
        neighbor_coview_item_ids = graph['item_nodes'][search_item_id]['neighbors']['also_viewed_item']
        if len(neighbor_coview_item_ids) < num:
            continue
        # set.intersection accepts any iterable, so the neighbor list does not
        # need to be converted to a set first; the result is the same.
        if len(coview_item_ids_set.intersection(neighbor_coview_item_ids)) >= num:
            neighbor_features = graph['item_nodes'][search_item_id]['features']
            res.append(neighbor_features['title'])
        # Past 30 matches the sample is rejected below anyway, so stop scanning.
        if len(res) > 30:
            break

    # Keep the sample only when the answer list is non-empty and has fewer than 20 entries.
    if len(res) < 20 and len(res) > 0:
        generated_data.append({"num": num, "item_title": item_title, "item_title_neighbour": ', '.join(res)})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2033)

# NOTE(review): the wording says "over {num}" but the check below is `>= num`
# (i.e. "at least num"). Left unchanged; confirm which one is intended.
question = "Which item shares over {num} bought-together items with item {item_title}?"
answer = "{item_title_neighbour}"

num = 4
generated_data = []

item_ids = list(graph['item_nodes'].keys())
random.shuffle(item_ids)
# The original cell also shuffled a list of all brand ids here; it was never
# used, so it has been removed (the order of `item_ids` is unaffected).

for item_id in item_ids:
    item_features = graph['item_nodes'][item_id]['features']
    item_title = item_features['title']

    if item_title == '':
        continue

    cobuy_item_ids = graph['item_nodes'][item_id]['neighbors']['bought_together_item']
    if len(cobuy_item_ids) < num:
        continue

    # Build the anchor item's set once instead of once per scanned item.
    cobuy_item_ids_set = set(cobuy_item_ids)

    res = []
    # Every id in item_ids is a key of graph['item_nodes'], so no membership
    # test is needed before the lookup below.
    for search_item_id in tqdm(item_ids):
        if search_item_id == item_id:
            continue
        neighbor_cobuy_item_ids = graph['item_nodes'][search_item_id]['neighbors']['bought_together_item']
        if len(neighbor_cobuy_item_ids) < num:
            continue
        # set.intersection accepts any iterable, so the neighbor list does not
        # need to be converted to a set first; the result is the same.
        if len(cobuy_item_ids_set.intersection(neighbor_cobuy_item_ids)) >= num:
            neighbor_features = graph['item_nodes'][search_item_id]['features']
            res.append(neighbor_features['title'])
        # Past 30 matches the sample is rejected below anyway, so stop scanning.
        if len(res) > 30:
            break

    # Keep the sample only when the answer list is non-empty and has fewer than 20 entries.
    if len(res) < 20 and len(res) > 0:
        generated_data.append({"num": num, "item_title": item_title, "item_title_neighbour": ', '.join(res)})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data


In [None]:
random.seed(2034)

question = "How many items have the same bought-together items with item {item_title}?"
answer = "{num}"
generated_data = []

item_ids = list(graph['item_nodes'].keys())
random.shuffle(item_ids)
# The original cell also shuffled a list of all brand ids here; it was never
# used, so it has been removed (the order of `item_ids` is unaffected).

for item_id in item_ids:
    item_features = graph['item_nodes'][item_id]['features']
    item_title = item_features['title']

    if item_title == '':
        continue

    cobuy_item_ids = graph['item_nodes'][item_id]['neighbors']['bought_together_item']
    # Build the anchor item's set once instead of once per scanned item.
    cobuy_item_ids_set = set(cobuy_item_ids)

    num_shared = 0
    # Every id in item_ids is a key of graph['item_nodes'], so no membership
    # test is needed before the lookup below.
    for search_item_id in item_ids:
        if search_item_id == item_id:
            continue
        neighbor_cobuy_item_ids = graph['item_nodes'][search_item_id]['neighbors']['bought_together_item']

        # Count items whose bought-together set is exactly equal to the anchor's.
        # NOTE(review): an anchor with no bought-together items matches every
        # other item that also has none; such samples only drop out through
        # the `< 100` filter below.
        if cobuy_item_ids_set == set(neighbor_cobuy_item_ids):
            num_shared += 1

    # Keep the sample only when the count is between 1 and 99 inclusive.
    if num_shared > 0 and num_shared < 100:
        generated_data.append({"num": num_shared, "item_title": item_title})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2035)

question = "What is the average price of the bought-together items with {item_title}?"

answer = "{average_price}"
generated_data = []

item_ids = list(graph['item_nodes'])
random.shuffle(item_ids)


for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']

    if item_title == '':
        continue

    cobuy_item_ids = item_node['neighbors']['bought_together_item']
    if len(cobuy_item_ids) == 0:
        continue

    # Gather the non-empty prices of the bought-together items present in the graph.
    all_price = []
    for search_item_id in tqdm(cobuy_item_ids):
        if search_item_id in graph['item_nodes']:
            price = graph['item_nodes'][search_item_id]['features']['price']
            if price != '':
                all_price.append(price)

    if len(all_price) > 0:
        average_price = round(sum(all_price) / len(all_price), 2)
        generated_data.append({"item_title": item_title, "average_price": average_price})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2036)

question = "What is the average price of the co-viewed items with {item_title}?"

answer = "{average_price}"
generated_data = []

item_ids = list(graph['item_nodes'])
random.shuffle(item_ids)


for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']

    if item_title == '':
        continue

    coview_item_ids = item_node['neighbors']['also_viewed_item']
    if len(coview_item_ids) == 0:
        continue

    # Gather the non-empty prices of the co-viewed items present in the graph.
    all_price = []
    for search_item_id in tqdm(coview_item_ids):
        if search_item_id in graph['item_nodes']:
            price = graph['item_nodes'][search_item_id]['features']['price']
            if price != '':
                all_price.append(price)

    if len(all_price) > 0:
        average_price = round(sum(all_price) / len(all_price), 2)
        generated_data.append({"item_title": item_title, "average_price": average_price})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2037)

question = "What is the most popular category name of the bought-together items with {item_title}?"

answer = "{category}"
generated_data = []

item_ids = list(graph['item_nodes'])
random.shuffle(item_ids)


for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']

    if item_title == '':
        continue

    cobuy_item_ids = item_node['neighbors']['bought_together_item']
    if len(cobuy_item_ids) == 0:
        continue

    # Tally categories over the bought-together items present in the graph.
    category_counter = {}
    for search_item_id in tqdm(cobuy_item_ids):
        if search_item_id not in graph['item_nodes']:
            continue
        category = graph['item_nodes'][search_item_id]['features']['category']

        # Only items whose category list has exactly one entry are counted.
        if len(category) != 1:
            continue

        for cate in category:
            category_counter[cate] = category_counter.get(cate, 0) + 1

    if len(category_counter) > 0:
        # On ties, max() returns the category that was inserted first.
        most_popular_category = max(category_counter, key=lambda name: category_counter[name])
        generated_data.append({"item_title": item_title, "category": most_popular_category})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

In [None]:
random.seed(2038)

question = "What is the most popular category name of the co-viewed items with {item_title}?"

answer = "{category}"
generated_data = []

item_ids = list(graph['item_nodes'])
random.shuffle(item_ids)


for item_id in item_ids:
    item_node = graph['item_nodes'][item_id]
    item_title = item_node['features']['title']

    if item_title == '':
        continue

    coview_item_ids = item_node['neighbors']['also_viewed_item']
    if len(coview_item_ids) == 0:
        continue

    # Tally categories over the co-viewed items present in the graph.
    category_counter = {}
    for search_item_id in tqdm(coview_item_ids):
        if search_item_id not in graph['item_nodes']:
            continue
        category = graph['item_nodes'][search_item_id]['features']['category']

        # Only items whose category list has exactly one entry are counted.
        if len(category) != 1:
            continue

        for cate in category:
            category_counter[cate] = category_counter.get(cate, 0) + 1

    if len(category_counter) > 0:
        # On ties, max() returns the category that was inserted first.
        most_popular_category = max(category_counter, key=lambda name: category_counter[name])
        generated_data.append({"item_title": item_title, "category": most_popular_category})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break
all_generated_data[(question, answer)] = generated_data

## Inductive reasoning (hard)
### Recommendation - What item should be recommended to the user based on his history: {item_titles}?

In [None]:
import json
import gzip
from collections import defaultdict
from tqdm import tqdm

# Function to load reviews
def load_reviews(file_path):
    """Group the rows of a comma-separated review file by their first field.

    Each line is stripped and split on ','. The first field is the grouping
    key, and a ``(last_field, second_field)`` tuple is appended to that key's
    list. All fields are kept as strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``defaultdict(list)`` mapping the first field of each row to the
        list of ``(last_field, second_field)`` tuples, in file order.
    """
    user_history = defaultdict(list)
    with open(file_path, 'r') as f:
        # Iterate over the file object directly so the file is streamed line
        # by line rather than loaded into memory in full with readlines().
        for line in tqdm(f):
            tmp = line.strip().split(',')
            if len(tmp) < 2:
                # A blank (or single-field) line has no second field; skip it
                # instead of raising IndexError on tmp[1].
                continue
            user_history[tmp[0]].append((tmp[-1], tmp[1]))
    return user_history

# Load the raw review file and group its rows with load_reviews.
reviews_csv_path = '/shared/data3/bowenj4/llm-graph-plugin/data/raw_data/amazon/item_dedup.csv'
user_history = load_reviews(reviews_csv_path)

In [None]:
random.seed(2039)

question = "What next item should be recommended to the user based on his history: {item_titles}?"
answer = "{targe_item_title}"
generated_data = []

user_ids = list(user_history)
random.shuffle(user_ids)

for user_id in user_ids:
    tmp_history = user_history[user_id]
    # Order the history by the first tuple element, in place.
    # NOTE(review): that element is a string, so the order is lexicographic;
    # if it holds numeric timestamps of differing lengths this is not
    # chronological -- confirm against the CSV format.
    tmp_history.sort(key=lambda entry: entry[0])

    # The last entry is the prediction target: it needs at least one earlier
    # entry, and its item must be in the graph with a non-empty title.
    if len(tmp_history) < 2:
        continue
    target_item_id = tmp_history[-1][-1]
    if target_item_id not in graph['item_nodes']:
        continue
    targe_item_title = graph['item_nodes'][target_item_id]['features']['title']
    if targe_item_title == '':
        continue

    # Titles of up to seven entries preceding the target, skipping items that
    # are missing from the graph or have an empty title.
    item_titles = []
    for entry in tmp_history[-8:-1]:
        past_item_id = entry[-1]
        if past_item_id not in graph['item_nodes']:
            continue
        past_title = graph['item_nodes'][past_item_id]['features']['title']
        if past_title != '':
            item_titles.append(past_title)

    # Require at least five usable history titles.
    if len(item_titles) >= 5:
        generated_data.append({"item_titles": item_titles, "targe_item_title": targe_item_title})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

### Retrieval - What is the exact matched/substitutive/complementary item given this query: {query_text}?

In [None]:
import pandas as pd

# Read the query/product example pairs and the product table.
examples_path = '/shared/data3/bowenj4/llm-graph-plugin/data/raw_data/amazon/shopping_queries_dataset/shopping_queries_dataset_examples.parquet'
products_path = '/shared/data3/bowenj4/llm-graph-plugin/data/raw_data/amazon/shopping_queries_dataset/shopping_queries_dataset_products.parquet'
df_examples = pd.read_parquet(examples_path)
df_products = pd.read_parquet(products_path)

# Left-join product columns onto every example row, matching on locale and product id.
merge_keys = ['product_locale', 'product_id']
df_examples_products = df_examples.merge(
    df_products,
    how='left',
    left_on=merge_keys,
    right_on=merge_keys,
)

In [None]:
# Set of all item ids in the graph, for fast membership tests.
prod_ids = set(graph['item_nodes'])

In [None]:
# Keep the rows flagged large_version == 1, then split them by the "split" column.
is_large_version = df_examples_products["large_version"] == 1
df_task_2 = df_examples_products.loc[is_large_version]
df_task_2_train = df_task_2.loc[df_task_2["split"] == "train"]
df_task_2_test = df_task_2.loc[df_task_2["split"] == "test"]

In [None]:
# exact match
np.random.seed(2040)

question = "What is the exact matched item given this query: {query_text}?"
answer = "{targe_item_title}"
generated_data = []

## Exact match: test rows labelled "E"
df_em = df_task_2_test[df_task_2_test["esci_label"] == "E"]

# shuffle the DataFrame rows
df_em = df_em.sample(frac = 1)

# Count the rows per query once. The original re-filtered the whole frame for
# every visited row, which is quadratic in the number of rows.
query_counts = df_em["query_id"].value_counts()
single_hit_query_ids = set(query_counts[query_counts == 1].index)

# process
for _, row in df_em.iterrows():
    # Keep US rows whose query has exactly one row in df_em and whose product
    # is an item node of the graph.
    if row['product_locale'] == 'us' and row['query_id'] in single_hit_query_ids and row['product_id'] in prod_ids:
        # The answer title comes from the graph node, not from the row's own
        # 'product_title' column.
        # NOTE(review): unlike the other cells, an empty graph title is not
        # filtered out here -- confirm whether it should be.
        generated_data.append({"query_text": row['query'], "targe_item_title": graph['item_nodes'][row['product_id']]['features']['title']})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
# substitutive
np.random.seed(2041)

question = "What is the substitutive item given this query: {query_text}?"
answer = "{targe_item_title}"
generated_data = []

## Substitute: test rows labelled "S"
df_em = df_task_2_test[df_task_2_test["esci_label"] == "S"]

# shuffle the DataFrame rows
df_em = df_em.sample(frac = 1)

# Count the rows per query once. The original re-filtered the whole frame for
# every visited row, which is quadratic in the number of rows.
query_counts = df_em["query_id"].value_counts()
single_hit_query_ids = set(query_counts[query_counts == 1].index)

# process
for _, row in df_em.iterrows():
    # Keep US rows whose query has exactly one row in df_em and whose product
    # is an item node of the graph.
    if row['product_locale'] == 'us' and row['query_id'] in single_hit_query_ids and row['product_id'] in prod_ids:
        # The answer title comes from the graph node, not from the row's own
        # 'product_title' column.
        # NOTE(review): unlike the other cells, an empty graph title is not
        # filtered out here -- confirm whether it should be.
        generated_data.append({"query_text": row['query'], "targe_item_title": graph['item_nodes'][row['product_id']]['features']['title']})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
# complementary
np.random.seed(2042)

question = "What is the complementary item given this query: {query_text}?"
answer = "{targe_item_title}"
generated_data = []

## Complement: test rows labelled "C"
df_em = df_task_2_test[df_task_2_test["esci_label"] == "C"]

# shuffle the DataFrame rows
df_em = df_em.sample(frac = 1)

# Count the rows per query once. The original re-filtered the whole frame for
# every visited row, which is quadratic in the number of rows.
query_counts = df_em["query_id"].value_counts()
single_hit_query_ids = set(query_counts[query_counts == 1].index)

# process
for _, row in df_em.iterrows():
    # Keep US rows whose query has exactly one row in df_em and whose product
    # is an item node of the graph.
    if row['product_locale'] == 'us' and row['query_id'] in single_hit_query_ids and row['product_id'] in prod_ids:
        # The answer title comes from the graph node, not from the row's own
        # 'product_title' column.
        # NOTE(review): unlike the other cells, an empty graph title is not
        # filtered out here -- confirm whether it should be.
        generated_data.append({"query_text": row['query'], "targe_item_title": graph['item_nodes'][row['product_id']]['features']['title']})

    # Stop once k samples have been collected.
    if len(generated_data) == k:
        break

all_generated_data[(question, answer)] = generated_data

In [None]:
import json

# Write all generated samples to 'preprocess_samples.pkl' in the current
# working directory. The context manager flushes and closes the file when the
# dump finishes; the original left the handle open.
with open('preprocess_samples.pkl', 'wb') as out_file:
    pickle.dump(all_generated_data, out_file)

In [None]:
# Report how many (question, answer) templates have generated samples.
num_templates = len(all_generated_data)
print(num_templates)