In [None]:
# Print the path of the Python interpreter running this kernel,
# which identifies the environment the notebook is executing in.
import sys
print(sys.executable)


In [None]:
import json
import time
import torch
import os
import pickle
import logging
import random
import tiktoken

from tqdm import tqdm
from rank_bm25 import BM25Okapi
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel

# Seed Python's `random` module so the random.choice calls in later cells start from a fixed state.
# Only the stdlib generator is seeded here; numpy and torch are imported but not seeded.
random.seed(42)

def load_data(file_name):
    """
    Load and parse a JSON file.

    Args:
        file_name (str): Path to the JSON file.

    Returns:
        The decoded JSON content, exactly as produced by `json.load`.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # default encoding, which is not UTF-8 on every system.
    with open(file_name, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def create_user_product_matrix(data):
    """
    Build a (user_id, product_id) -> review lookup plus user and product indices.

    Args:
        data (list): Iterable of user records. Each record is read via
            `user['id']` and `user['profile']`; each profile entry is read via
            `review['pid']` and the optional 'title', 'text' and 'rating' keys.

    Returns:
        tuple: (matrix, user_index, product_index) where
            - matrix maps (user_id, product_id) to a dict with the keys
              "reviewTitle", "reviewText", "doc_id" and "reviewRating";
            - user_index maps each user id that has at least one review to an
              integer, assigned in order of first appearance;
            - product_index maps each product id to an integer, assigned in
              order of first appearance.
    """
    matrix = {}
    # Indices are assigned in first-seen order. Enumerating a set of strings
    # would give an order that can change between interpreter runs, which makes
    # any seeded random draw over these keys non-reproducible.
    user_index = {}
    product_index = {}
    doc_id = 0

    for user in data:
        user_id = user['id']
        for review in user['profile']:
            product_id = review['pid']
            user_index.setdefault(user_id, len(user_index))
            product_index.setdefault(product_id, len(product_index))

            # Missing or empty title/text falls back to the string "None"
            review_title = review.get('title', 'None') or 'None'
            review_text = review.get('text', 'None') or 'None'
            review_rating = review.get('rating', 'None')
            # Float ratings are converted with int(), which truncates toward zero
            if isinstance(review_rating, float):
                review_rating = int(review_rating)

            # A repeated (user_id, product_id) pair overwrites the earlier entry
            matrix[(user_id, product_id)] = {
                "reviewTitle": review_title,
                "reviewText": review_text,
                "doc_id": doc_id,
                "reviewRating": review_rating
            }
            doc_id += 1  # doc_id counts every review seen, including overwritten ones

    return matrix, user_index, product_index

In [None]:
def get_user_all_ratings_no_ranking(user_id, product_id, user_product_matrix):
    """
    Collect every review stored for `user_id`, in the matrix's iteration order.

    Args:
        user_id: User whose reviews are wanted.
        product_id: Not used by this function; accepted for signature parity
            with the neighbor lookup.
        user_product_matrix (dict): Mapping of (user_id, product_id) to review dicts.

    Returns:
        list: One dict per review with the keys "product_id", "reviewRating",
        "reviewTitle", "reviewText" and "doc_id". Missing rating/title/text
        fields default to the string "None".
    """
    collected = []
    for (owner, pid), entry in user_product_matrix.items():
        if owner != user_id:
            continue
        collected.append({
            "product_id": pid,
            "reviewRating": entry.get('reviewRating', "None"),
            "reviewTitle": entry.get('reviewTitle', "None"),
            "reviewText": entry.get('reviewText', "None"),
            "doc_id": entry['doc_id'],
        })
    return collected

def get_neighbors_reviews_no_ranking(user_id, product_id, user_product_matrix):
    """
    Collect the reviews that other users wrote for `product_id`.

    Args:
        user_id: User to exclude from the result.
        product_id: Product whose reviews are wanted.
        user_product_matrix (dict): Mapping of (user_id, product_id) to review dicts.

    Returns:
        list: One dict per neighbor review with the keys "user_id",
        "reviewRating", "reviewTitle", "reviewText" and "doc_id", in the
        matrix's iteration order. A stored review lacking any of those fields
        raises KeyError.
    """
    neighbors = []
    for (reviewer, pid), entry in user_product_matrix.items():
        # Keep only reviews of the requested product written by someone else
        if pid == product_id and reviewer != user_id:
            neighbors.append({
                "user_id": reviewer,
                "reviewRating": entry['reviewRating'],
                "reviewTitle": entry['reviewTitle'],
                "reviewText": entry['reviewText'],
                "doc_id": entry['doc_id'],
            })
    return neighbors


In [None]:
# Smoke test: build the matrix for one split and exercise both lookup helpers.
# The names defined here (data, user_product_matrix, user_index, product_index)
# are notebook-level globals that later cells read.
file_path = "../data_splits/amazon_dev.json"  # Replace with your actual file name

# Load the JSON data
data = load_data(file_path)

# Construct the user-product matrix
user_product_matrix, user_index, product_index = create_user_product_matrix(data)

# Show the first five entries of each structure as a sanity check
print("User-Product Matrix Sample:", list(user_product_matrix.items())[:5])
print("User Index Sample:", list(user_index.items())[:5])
print("Product Index Sample:", list(product_index.items())[:5])


# Pick a user and a product independently at random; the chosen user need not
# have reviewed the chosen product.
random_user_id = random.choice(list(user_index.keys()))
random_product_id = random.choice(list(product_index.keys()))

print("Testing with User ID:", random_user_id)
print("Testing with Product ID:", random_product_id)

# Retrieve every review written by the chosen user
user_reviews = get_user_all_ratings_no_ranking(random_user_id, random_product_id, user_product_matrix)

# Retrieve other users' reviews of the chosen product
neighbor_reviews = get_neighbors_reviews_no_ranking(random_user_id, random_product_id, user_product_matrix)

# Output the lists, one review per line
print("\nUser Reviews:")
for review in user_reviews:
    print(review)

print("\nNeighbor Reviews:")
for review in neighbor_reviews:
    print(review)




In [None]:
def tiktoken_count(text, encoding_name):
    """
    Return how many tokens `text` occupies under a model's tiktoken encoding.

    Args:
        text (str): Text to tokenize.
        encoding_name (str): Model name passed to `tiktoken.encoding_for_model`
            to select the encoding.

    Returns:
        int: Length of the encoded token sequence.
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def progressive_truncation_by_reviews(reviews, encoding_name, limit=6500):
    """
    Keep the longest leading run of reviews whose combined token count fits `limit`.

    Each review is measured as "<reviewTitle> <reviewText>". Reviews are taken
    in order, and the scan stops at the first review that would push the running
    total above the limit; later reviews are not considered even if shorter.

    Args:
        reviews (list): Review dicts providing 'reviewTitle' and 'reviewText'.
        encoding_name (str): Model name used to pick the tokenizer.
        limit (int, optional): Maximum total token count. Defaults to 6500.

    Returns:
        list: The reviews that were kept, in their original order.
    """
    kept = []
    spent = 0

    for entry in reviews:
        combined = "{} {}".format(entry['reviewTitle'], entry['reviewText'])
        projected = spent + tiktoken_count(combined, encoding_name)

        # The first review that does not fit ends the scan
        if projected > limit:
            break

        kept.append(entry)
        spent = projected

    return kept

def truncate_reviews_for_users(user_ids, product_id, user_product_matrix, encoding_name, limit=6500, include_neighbors=False):
    """
    Gather reviews for a set of users (and optionally neighbor reviews for one
    product), then truncate each group to a token budget.

    Args:
        user_ids (list): User IDs whose reviews are collected.
        product_id (str): Product whose reviews by *other* users are collected
            when `include_neighbors` is True.
        user_product_matrix (dict): Mapping of (user_id, product_id) to review dicts.
        encoding_name (str): Model name used to pick the tokenizer.
        limit (int, optional): Token budget applied separately to the user
            group and the neighbor group. Defaults to 6500.
        include_neighbors (bool, optional): Whether to collect neighbor reviews.
            Defaults to False.

    Returns:
        dict: With the keys
            - "user_reviews": user reviews after truncation.
            - "neighbor_reviews": neighbor reviews after truncation, or None
              when `include_neighbors` is False.
            - "user_review_ids": doc_ids of all collected user reviews
              (gathered before truncation).
            - "neighbor_review_ids": doc_ids of all collected neighbor reviews
              (gathered before truncation), or None when `include_neighbors`
              is False.

    Side effects:
        Prints the token counts of each group before and after truncation.
    """
    def _token_total(reviews):
        # Total tokens of "<title> <text>" summed over the given reviews
        return sum(
            tiktoken_count(f"{r['reviewTitle']} {r['reviewText']}", encoding_name)
            for r in reviews
        )

    user_reviews_list = []  # Reviews of all requested users, grouped per user
    user_review_ids = []  # Matching doc_ids

    for user_id in user_ids:
        for (uid, pid), review_data in user_product_matrix.items():
            if uid == user_id:
                user_reviews_list.append({
                    "product_id": pid,
                    "reviewTitle": review_data.get("reviewTitle", "None"),
                    "reviewText": review_data.get("reviewText", "None"),
                    "reviewRating": review_data.get("reviewRating", "None"),
                    "doc_id": review_data["doc_id"]
                })
                user_review_ids.append(review_data["doc_id"])

    neighbor_reviews_list = []  # Reviews of product_id by users outside user_ids
    neighbor_review_ids = []  # Matching doc_ids

    if include_neighbors:
        # The neighbor set does not depend on which user is being processed, so
        # it is collected exactly once. Collecting it inside the per-user loop
        # would append every neighbor review once per entry in user_ids,
        # inflating the token counts and wasting the budget on duplicates.
        for (uid, pid), review_data in user_product_matrix.items():
            if pid == product_id and uid not in user_ids:
                neighbor_reviews_list.append({
                    "user_id": uid,
                    "product_id": pid,
                    "reviewTitle": review_data.get("reviewTitle", "None"),
                    "reviewText": review_data.get("reviewText", "None"),
                    "reviewRating": review_data.get("reviewRating", "None"),
                    "doc_id": review_data["doc_id"]
                })
                neighbor_review_ids.append(review_data["doc_id"])

    # Token count before truncation
    user_reviews_tokens = _token_total(user_reviews_list)
    neighbor_reviews_tokens = _token_total(neighbor_reviews_list) if include_neighbors else 0

    print(f"User Reviews Token Count Before Truncation: {user_reviews_tokens}")
    if include_neighbors:
        print(f"Neighbor Reviews Token Count Before Truncation: {neighbor_reviews_tokens}")

    # Apply progressive truncation; each group gets its own budget
    truncated_user_reviews = progressive_truncation_by_reviews(user_reviews_list, encoding_name, limit)
    truncated_neighbor_reviews = progressive_truncation_by_reviews(neighbor_reviews_list, encoding_name, limit) if include_neighbors else []

    # Token count after truncation
    truncated_user_tokens = _token_total(truncated_user_reviews)
    truncated_neighbor_tokens = _token_total(truncated_neighbor_reviews) if include_neighbors else 0

    print(f"User Reviews Token Count After Truncation: {truncated_user_tokens}")
    if include_neighbors:
        print(f"Neighbor Reviews Token Count After Truncation: {truncated_neighbor_tokens}")

    return {
        "user_reviews": truncated_user_reviews,
        "neighbor_reviews": truncated_neighbor_reviews if include_neighbors else None,
        "user_review_ids": user_review_ids,
        "neighbor_review_ids": neighbor_review_ids if include_neighbors else None
    }


In [None]:
# Demo inputs for the truncation helper. These names are notebook-level globals
# and the two result dicts are read again by the prompt-generation demo cell.
test_user_ids = ["AEP3TWJ4QO4TWEMKANALKY75TRTQ", "AGISVB3KU54MZ3ISLSQRT2OSDDLQ"]  # Replace with real user IDs
test_product_id = "B08NPTV1DR"
# NOTE(review): this string is handed to tiktoken.encoding_for_model; presumably it is
# resolved through tiktoken's model-name prefix matching - confirm it maps to the intended encoding.
encoding_name = "gpt-4o-mini-20240718"  # Model encoding name

# Only truncate user reviews
processed_reviews_users = truncate_reviews_for_users(test_user_ids, test_product_id, user_product_matrix, encoding_name, limit=6500, include_neighbors=False)

print("\nTruncated User Reviews:", processed_reviews_users["user_reviews"])
print("\nUser Review IDs:", processed_reviews_users["user_review_ids"])

# Truncate both user reviews and neighbor reviews for the selected product
processed_reviews_users_neighbors = truncate_reviews_for_users(test_user_ids, test_product_id, user_product_matrix, encoding_name, limit=6500, include_neighbors=True)

print("\nTruncated Neighbor Reviews:", processed_reviews_users_neighbors["neighbor_reviews"])
print("\nNeighbor Review IDs:", processed_reviews_users_neighbors["neighbor_review_ids"])

In [None]:
def generate_prompt(user_reviews, neighbor_reviews, dataset, task):
    """
    Assemble the LLM prompt from the user's reviews, neighbor reviews and the task.

    The prompt consists of an introduction chosen by which review groups are
    non-empty, the reviews themselves (one per line, with the rating appended
    only for the rating task), and a task-specific instruction. An unrecognized
    task yields no instruction section.

    Args:
        user_reviews (list): The user's (truncated) review dicts; may be empty or None.
        neighbor_reviews (list): Other users' (truncated) review dicts; may be empty or None.
        dataset (str): Dataset name; "b2w" makes the instruction ask for Portuguese output.
        task (str): One of "reviewTitle", "reviewText" or "reviewRating".

    Returns:
        str: The complete prompt text.
    """
    with_rating = task == "reviewRating"

    def render(entry):
        # One prompt line per review; missing fields are shown as "None"
        line = 'Review title: "{}", Review text: "{}"'.format(
            entry.get("reviewTitle", "None"),
            entry.get("reviewText", "None"),
        )
        if with_rating:
            line += ", Review rating: {}".format(entry.get("reviewRating", "None"))
        return line + "\n"

    has_own = bool(user_reviews)
    has_others = bool(neighbor_reviews)

    # Introduction keyed by (user reviews present, neighbor reviews present)
    openings = {
        (True, True): "Given the following reviews from the same user and other users on the same product:\n",
        (True, False): "Given the following reviews from the user on different products:\n",
        (False, True): "Given the following reviews from other users on the same product:\n",
        (False, False): "Given only information on this review:\n",
    }
    pieces = [openings[(has_own, has_others)]]

    if has_own:
        pieces.append("User's Own Reviews:\n")
        pieces.extend(render(entry) for entry in user_reviews)

    if has_others:
        pieces.append("Other Users' Reviews:\n")
        pieces.extend(render(entry) for entry in neighbor_reviews)

    # Only the b2w dataset asks for Portuguese output
    lang_hint = "in Portuguese " if dataset == "b2w" else ""

    closings = (
        (
            "reviewTitle",
            f"\nGenerate a title {lang_hint}for the following product review from this user without any explanation:\n"
            "Generate the review title in 10 words or less using the format: 'Review title:'.",
        ),
        (
            "reviewText",
            f"\nGenerate a review {lang_hint}for the following product from this user given the review title, without any explanation:\n"
            "Generate the review text using the format: 'Review text:'.",
        ),
        (
            "reviewRating",
            "\nGenerate an integer rating from 1-5 for the following product from this user given the review title and text, without any explanation:\n"
            "Generate the integer review rating using the format: 'Rating:'.",
        ),
    )
    for task_name, instruction in closings:
        if task == task_name:
            pieces.append(instruction)
            break

    return "".join(pieces)


In [None]:
# Define dataset and task
dataset = "amazon"
task = "reviewTitle"  # Change to "reviewText" or "reviewRating" as needed

# Generate prompt
prompt = generate_prompt(
    processed_reviews_users["user_reviews"],
    processed_reviews_users_neighbors["neighbor_reviews"],
    dataset,
    task
)

print("\nGenerated Prompt:\n")
print(prompt)


In [None]:
def process_item_no_ranking(
    item, 
    user_product_matrix, 
    user_index, 
    product_index
):
    """
    Build the unranked retrieval record for one dataset item.

    The first profile entry of `item` is treated as the target (label) review.
    The record contains that target, the user's other reviews, other users'
    reviews of the target product, and one review drawn at random from a
    randomly chosen different user.

    Args:
        item (dict): Dataset record read via item['id'] and item['profile'][0].
        user_product_matrix (dict): Mapping of (user_id, product_id) to review dicts.
        user_index (dict): Mapping of user IDs to indices; its keys are the
            pool the random user is drawn from.
        product_index (dict): Mapping of product IDs to indices (not used here).

    Returns:
        dict: With the keys "user_id", "product_id", "user_review_text",
        "user_review_title", "user_review_rating", "user_ratings",
        "neighbor_ratings" and "random_review". "random_review" is None when no
        other user exists or the chosen user has no reviews.

    Raises:
        IndexError: If item['profile'] is empty.
        KeyError: If 'id', 'profile' or the target's 'pid' is missing.
    """
    target_review = item['profile'][0]
    example_user_id = item['id']
    example_product_id = target_review['pid']

    # Retrieve all user reviews (without ranking)
    all_user_reviews = get_user_all_ratings_no_ranking(
        user_id=example_user_id,
        product_id=None,  # Get all reviews from the user
        user_product_matrix=user_product_matrix
    )

    # Drop the target (label) review by product id rather than by list position.
    # Position 0 is only the target when this user id appears once in the data
    # used to build the matrix; otherwise a positional slice keeps the label.
    user_ratings = [
        review for review in all_user_reviews
        if review["product_id"] != example_product_id
    ]

    # Retrieve neighbor reviews (without ranking)
    neighbor_ratings = get_neighbors_reviews_no_ranking(
        user_id=example_user_id, 
        product_id=example_product_id, 
        user_product_matrix=user_product_matrix
    )

    # Draw a random review from a random *other* user. With no other user the
    # candidate list is empty and random.choice would raise IndexError, so that
    # case yields None instead.
    other_user_ids = [uid for uid in user_index.keys() if uid != example_user_id]
    random_review = None
    if other_user_ids:
        random_user_id = random.choice(other_user_ids)
        all_random_user_reviews = get_user_all_ratings_no_ranking(
            user_id=random_user_id,
            product_id=None,  # Get all reviews for the user
            user_product_matrix=user_product_matrix
        )
        if all_random_user_reviews:
            random_review = random.choice(all_random_user_reviews)

    # Extract the target review's fields; float ratings are truncated to int
    user_review_text = target_review.get('text', None)
    user_review_title = target_review.get('title', None)
    user_review_rating = target_review.get('rating', None)
    if isinstance(user_review_rating, float):
        user_review_rating = int(user_review_rating)

    # Construct the output dictionary
    processed_item = {
        "user_id": example_user_id,
        "product_id": example_product_id,
        "user_review_text": user_review_text,
        "user_review_title": user_review_title, 
        "user_review_rating": user_review_rating,
        "user_ratings": user_ratings,
        "neighbor_ratings": neighbor_ratings,
        "random_review": random_review
    }

    return processed_item

def process_all_items_no_ranking(data, user_product_matrix, user_index, product_index, output_file="output_no_ranking.json"):
    """
    Run `process_item_no_ranking` over every item and dump the results as JSON.

    Args:
        data (list): Dataset items, processed in order.
        user_product_matrix (dict): Mapping of (user_id, product_id) to review dicts.
        user_index (dict): Mapping of user IDs to indices.
        product_index (dict): Mapping of product IDs to indices.
        output_file (str): Path of the JSON file to write.

    Returns:
        None
    """
    records = [
        process_item_no_ranking(entry, user_product_matrix, user_index, product_index)
        for entry in data
    ]

    # Write all records at once, indented by four spaces
    with open(output_file, "w") as sink:
        json.dump(records, sink, indent=4)

    print(f"Processed data saved to {output_file}")


In [None]:
import os

def process_json_folder(input_folder, output_folder):
    """
    Process every non-training JSON file in a folder and save one result file each.

    For each selected file, the user-product matrix is built from that file's
    data, all items are processed without ranking, and the result is written to
    `output_folder` as "processed_<original name>".

    Args:
        input_folder (str): Folder containing the input JSON files.
        output_folder (str): Folder for the processed results; created if missing.

    Returns:
        None
    """

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Select *.json files whose name does not contain "train" (case-insensitive).
    # os.listdir returns names in arbitrary order, so sort them: item processing
    # draws from the shared `random` generator, and a stable file order is
    # needed for the seeded draws to land on the same files every run.
    json_files = sorted(
        f for f in os.listdir(input_folder)
        if f.endswith(".json") and "train" not in f.lower()
    )

    if not json_files:
        print("\nNo valid JSON files found for processing.")
        return

    for json_file in json_files:
        input_file_path = os.path.join(input_folder, json_file)

        print(f"\nProcessing file: {json_file}")

        # Load JSON data
        data = load_data(input_file_path)

        # Construct the user-product matrix
        user_product_matrix, user_index, product_index = create_user_product_matrix(data)

        # Define output file name
        output_file_path = os.path.join(output_folder, f"processed_{json_file}")

        # Process all items and save results
        process_all_items_no_ranking(data, user_product_matrix, user_index, product_index, output_file_path)

        print(f"Saved processed data to: {output_file_path}")

    print("\nAll files processed successfully!")


In [None]:
# Single-file alternative, kept for reference (uses the matrix built in the demo cell):
# output_file = "output_no_ranking.json"
# process_all_items_no_ranking(data, user_product_matrix, user_index, product_index, output_file)

# Input and output locations, relative to the notebook's working directory
input_folder = "../data_splits/"  # Path to folder with JSONs
output_folder = "../data_norank/"  # Path to save processed outputs

# Process every non-training JSON file in input_folder
process_json_folder(input_folder, output_folder)

In [None]:
from IPython.display import display
from ipywidgets import Button

def shutdown_kernel():
    """
    Display a notice, then call `do_shutdown(True)` on the running IPython kernel.

    Only works inside an IPython kernel, where `get_ipython` is available.
    """
    from IPython.display import display
    display("Shutting down kernel...")
    active_kernel = get_ipython().kernel
    active_kernel.do_shutdown(True)

# Final step of the notebook: request kernel shutdown once batch processing has finished.
shutdown_kernel()
