In [2]:
!nvidia-smi

Thu Sep 26 19:48:11 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000000:A1:00.0 Off |                    0 |
| N/A   29C    P0             43W /  300W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer 
import time
import torch
import json

llama3_model = pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device_map="auto",)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Function to generate a title for each review using the combined file
def generate_title_for_reviews(data, max_input_length, max_output_length, tokenizer, k, mode="both", task=None):
    results = []

    # Start timing
    start_time = time.time()

    # Process the dataset item by item
    for item in tqdm(data, desc="Processing Reviews"):
        result = process_item(item, max_input_length, max_output_length, tokenizer, k, mode=mode, task=task)
        results.append(result)

    # End timing
    end_time = time.time()
    print(f"Time taken to generate titles for mode '{mode}': {end_time - start_time:.2f} seconds")

    return results

# Function to process a single item (user's review) from the combined JSON file
def process_item(item, max_input_length, max_output_length, tokenizer, k, mode="both", task=None):
    example_user_id = item['user_id']
    example_product_id = item['product_id']

    # Adjust the query based on the task type (e.g., title, review, or both)
    if task == "title":
        query = item.get('user_review_title', '')  # Use the review title as the query
    elif task == "review":
        query = item.get('user_review_text', '')  # Use the review text as the query

    # Retrieve the pre-ranked data for the user from the combined file
    user_ratings = item.get('user_ratings', [])[:k]  
    neighbor_ratings = item.get('neighbor_ratings', [])[:k]  
    all_ratings = item.get('all_ratings', [])[:k]  

    # Construct the prompt using the top-k user and neighbor ratings
    prompt = tokenized_prompt(user_ratings, neighbor_ratings, query, max_input_length, tokenizer, mode=mode, task=task)
    
    # Generate text using the Llama 3 model
    generated_text = llama3_model(prompt, max_new_tokens=max_output_length, do_sample=True, return_full_text=False)
    
    # Extract the generated title
    title = generated_text[0]['generated_text'].strip()

    return {"user_id": item['user_id'], "product_id": item['product_id'], "output": title}

# Tokenized prompt function
def tokenized_prompt(user_ratings, neighbor_ratings, inp, max_input_length, tokenizer, mode="both", task="title"):
    user_contexts = []
    neighbor_contexts = []

    # Create user review context if mode is 'both' or 'user'
    if mode in ["both", "user"]:
        for idx, review in enumerate(user_ratings, start=1):
            context = f"User's Product {idx} Review: Review text: \"{review['reviewText']}\", Review title: \"{review['reviewTitle']}\""
            tokens = tokenizer(context, max_length=max_input_length, truncation=True)
            user_contexts.append(tokenizer.batch_decode([tokens['input_ids']], skip_special_tokens=True)[0])

    # Create neighbor review context if mode is 'both' or 'neighbor'
    if mode in ["all", "both", "neighbor"]:
        for idx, neighbor in enumerate(neighbor_ratings, start=1):
            context = f"User {idx} Product Review: Review text: \"{neighbor['reviewText']}\", Review title: \"{neighbor['reviewTitle']}\""
            tokens = tokenizer(context, max_length=max_input_length, truncation=True)
            neighbor_contexts.append(tokenizer.batch_decode([tokens['input_ids']], skip_special_tokens=True)[0])

    # Combine contexts based on mode
    combined_contexts = []
    if mode in ["both", "user"]:
        combined_contexts.append("User's Own Reviews:\n")
        combined_contexts.extend(user_contexts)
    if mode in ["all", "both", "neighbor"]:
        combined_contexts.append("\nOther Users' Reviews:\n")
        combined_contexts.extend(neighbor_contexts)

    combined_context_str = "\n".join(combined_contexts)
    
    # Custom prompting words based on mode
    if mode == "both":
        intro = "Given the following reviews from the same user and other users on the same product:\n"
    elif mode == "all":
        intro = "Given the following reviews from any user on any product:\n"
    elif mode == "user":
        intro = "Given the following reviews from the user on different products:\n"
    elif mode == "neighbor":
        intro = "Given the following reviews from other users on the same product:\n"

    if task == "title": # the "original" directions
        direction = "\nGenerate a title for the following product review from this user without any explanation: Review:"
        gen_direction = "Generate the answer in 10 words or less using the format: 'The title is:'.\n"
    elif task == "review":
        direction = "\nGenerate a review for the following product from this user given the review title, without any explanation: Title:"
        gen_direction = "Generate a review using the format: 'The review text is: '.\n"

    # Final prompt with specific instruction
    combined_prompt = (
        f"<|start_header_id|>user<|end_header_id|>"
        f"{intro}"
        f"{combined_context_str}.\n"
        f"{direction} \"{inp}\".\n"
        f"{gen_direction}"
        f"Do NOT generate anything else!."
        f"<<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )
    
    return combined_prompt

In [5]:
# Define mock data for testing
mock_item = {
    "user_id": "TEST_USER_123",
    "product_id": "TEST_PRODUCT_456",
    "user_review_text": "This is a great product, highly recommend!",
    "user_ratings": [
        {"reviewTitle": "Excellent choice", "reviewText": "Loved the product, great value!"},
        {"reviewTitle": "Worth the money", "reviewText": "Superb quality, will buy again."}
    ],
    "neighbor_ratings": [
        {"reviewTitle": "Good buy", "reviewText": "Very satisfied with the product quality."},
        {"reviewTitle": "Average product", "reviewText": "It works, but expected better quality."}
    ]
}

# Set parameters
max_input_length = 512
max_output_length = 218
k = 2  # Take the top-2 ratings
mode = "both"  # Use both user and neighbor reviews
task = "review"
# Test the tokenized prompt
prompt = tokenized_prompt(
    user_ratings=mock_item['user_ratings'],
    neighbor_ratings=mock_item['neighbor_ratings'],
    inp=mock_item['user_review_text'],
    max_input_length=max_input_length,
    tokenizer=tokenizer,
    mode=mode,
    task=task
)

# Print the generated prompt to verify it
print("Generated Prompt:")
print(prompt)

# Generate text using the Llama 3 model
generated_text = llama3_model(prompt, max_new_tokens=max_output_length, do_sample=True, return_full_text=False)

# Extract the generated title
title = generated_text[0]['generated_text'].strip()

# Print the generated title
print("Generated Title:")
print(title)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Prompt:
<|start_header_id|>user<|end_header_id|>Given the following reviews from the same user and other users on the same product:
User's Own Reviews:

User's Product 1 Review: Review text: "Loved the product, great value!", Review title: "Excellent choice"
User's Product 2 Review: Review text: "Superb quality, will buy again.", Review title: "Worth the money"

Other Users' Reviews:

User 1 Product Review: Review text: "Very satisfied with the product quality.", Review title: "Good buy"
User 2 Product Review: Review text: "It works, but expected better quality.", Review title: "Average product".

Generate a review for the following product from this user given the review title, without any explanation: Title: "This is a great product, highly recommend!".
Generate a review around the user's review length using the format: 'The review text is: '.
Do NOT generate anything else!.<<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Generated Title:
The review text is: "Loved 

In [6]:
    # if mode in ["both", "user"]:
    #     for idx, review in enumerate(user_ratings, start=1):
    #         context = "User's Product {} Review: Review text: \"{}\", Review title: \"{}\"".format(
    #             idx, review["reviewText"], review.get("reviewTitle", "No title available"),
    #         )
    #         tokens = tokenizer(context, max_length=max_input_length, truncation=True)
    #         user_contexts.append(tokenizer.batch_decode([tokens['input_ids']], skip_special_tokens=True)[0])

    # # Create neighbor review context if mode is 'both', 'neighbor', or 'all'
    # if mode in ["all", "both", "neighbor"]:
    #     for idx, neighbor in enumerate(neighbor_ratings, start=1):
    #         context = "User {} Product Review: Review text: \"{}\", Review title: \"{}\"".format(
    #             idx, neighbor["reviewText"], neighbor.get("reviewTitle", "No title available"),
    #         )
    #         tokens = tokenizer(context, max_length=max_input_length, truncation=True)
    #         neighbor_contexts.append(tokenizer.batch_decode([tokens['input_ids']], skip_special_tokens=True)[0])

In [7]:
# Function to generate titles for multiple modes and k-values
def generate_title_for_all_modes(data, max_input_length, max_output_length, tokenizer, k_values=[2, 3, 5]):
    modes = ["all", "both", "user", "neighbor"]
    task="review"
    for k in k_values:
        for mode in modes:
            print(f"Processing mode: {mode} with k={k}")
            
            # Generate titles for the current mode and k value
            results = generate_title_for_reviews(
                data, 
                max_input_length, 
                max_output_length, 
                tokenizer, 
                k=k,  # Now passing k for internal handling
                mode=mode,
                task=task
            )
            
            # Define the output file name
            output_file = f'results_test_{mode}_k{k}_review.json'
            
            # Save the results to a JSON file
            with open(output_file, 'w') as f:
                json.dump(results, f, indent=4)
            
            print(f"Results for mode '{mode}' with k={k} have been saved to {output_file}")

In [8]:
#Function to load data from a JSON file
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

ranked_test_file = "../data/AmazonReview/amazon_title_generation_questions_test_ranked_k_5_reviewText.json"
ranked_data = load_data(ranked_test_file)

# Call the function to process the data
generate_title_for_all_modes(
    data=ranked_data,
    max_input_length=512,
    max_output_length=256,
    tokenizer=tokenizer,
    k_values=[1, 2, 4]
)

Processing mode: all with k=1


Processing Reviews:   0%|                                        | 0/2500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Reviews:   0%|                                | 1/2500 [00:00<37:50,  1.10it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Reviews:   0%|                                | 2/2500 [00:01<34:43,  1.20it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Reviews:   0%|                                | 3/2500 [00:02<38:15,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Reviews:   0%|                                | 4/2500 [00:03<41:12,  1.01it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Reviews:   0%|                                | 5/2500 [00:05<48:13,  1.16s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing Reviews:   

KeyboardInterrupt: 

In [None]:
# Function to generate titles for multiple modes and k-values
def generate_title_for_all_modes(data, max_input_length, max_output_length, tokenizer, k_values=[2, 3, 5]):
    modes = ["all", "both", "user", "neighbor"]
    task="review"
    for k in k_values:
        for mode in modes:
            print(f"Processing mode: {mode} with k={k}")
            
            # Generate titles for the current mode and k value
            results = generate_title_for_reviews(
                data, 
                max_input_length, 
                max_output_length, 
                tokenizer, 
                k=k,  # Now passing k for internal handling
                mode=mode,
                task=task
            )
            
            # Define the output file name
            output_file = f'results_dev_{mode}_k{k}_review.json'
            
            # Save the results to a JSON file
            with open(output_file, 'w') as f:
                json.dump(results, f, indent=4)
            
            print(f"Results for mode '{mode}' with k={k} have been saved to {output_file}")

In [None]:
ranked_dev_file = "../data/AmazonReview/amazon_title_generation_questions_dev_ranked_k_5_reviewText.json"
ranked_data = load_data(ranked_dev_file)

# Call the function to process the data
generate_title_for_all_modes(
    data=ranked_data,
    max_input_length=512,
    max_output_length=256,
    tokenizer=tokenizer,
    k_values=[1, 2, 4]
)

In [None]:
from IPython.display import display
from ipywidgets import Button

def shutdown_kernel():
    from IPython.display import display
    display("Shutting down kernel...")
    get_ipython().kernel.do_shutdown(True)

shutdown_kernel()
