In [None]:
# Shell escape: print the GPU status reported by `nvidia-smi` for this runtime.
!nvidia-smi

In [2]:
import json
import time
import torch
import os
from tqdm import tqdm
from rank_bm25 import BM25Okapi
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel

def load_data(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so review text with non-ASCII characters is
    # read the same way regardless of the platform's locale default.
    with open(file_name, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def create_user_product_matrix(data):
    """Index every review in ``data`` by its ``(user_id, product_id)`` pair.

    Args:
        data: Iterable of user records. Each record provides ``'id'`` and a
            ``'profile'`` list of review dicts with ``'productAsin'`` and,
            optionally, ``'title'`` and ``'text'``.

    Returns:
        A ``(matrix, user_index, product_index)`` tuple where

        * ``matrix`` maps ``(user_id, product_id)`` to a dict with
          ``"reviewTitle"``, ``"reviewText"`` and ``"doc_id"``. Missing or
          empty titles/texts are stored as the string ``"None"``. ``doc_id``
          counts reviews in iteration order; a repeated pair overwrites the
          earlier entry while the counter still advances.
        * ``user_index`` / ``product_index`` map each id that owns at least
          one review to a dense integer, numbered in first-seen order.

    Raises:
        KeyError: If a user lacks ``'id'``/``'profile'`` or a review lacks
            ``'productAsin'``.
    """
    matrix = {}
    # Plain dicts keep insertion order, so the indices are reproducible across
    # runs (enumerating a set of strings depends on per-process hash seeds).
    user_index = {}
    product_index = {}
    doc_id = 0

    for user in data:
        user_id = user['id']
        for review in user['profile']:
            product_id = review['productAsin']
            user_index.setdefault(user_id, len(user_index))
            product_index.setdefault(product_id, len(product_index))

            matrix[(user_id, product_id)] = {
                # `or 'None'` also covers keys that are present but empty/None.
                "reviewTitle": review.get('title', 'None') or 'None',
                "reviewText": review.get('text', 'None') or 'None',
                "doc_id": doc_id,
            }
            doc_id += 1

    return matrix, user_index, product_index

In [3]:
def retrieve_corpus_from_matrix(user_product_matrix):
    """Flatten the user-product matrix into a list of retrievable documents.

    Each entry whose stripped ``reviewText`` is non-empty becomes one dict
    with ``doc_id``, ``user_id``, ``product_id``, the stripped ``reviewText``
    and ``reviewTitle``, and ``combined`` (title, a space, then text) that is
    used for ranking. Entries with blank text are left out.
    """
    documents = []
    for (user_id, product_id), entry in user_product_matrix.items():
        body = entry['reviewText'].strip()
        heading = entry['reviewTitle'].strip()
        identifier = entry['doc_id']
        if not body:
            # Blank reviews carry nothing to rank against.
            continue
        record = {
            "doc_id": identifier,
            "user_id": user_id,
            "product_id": product_id,
            "reviewText": body,
            "reviewTitle": heading,
        }
        record["combined"] = f"{heading} {body}"
        documents.append(record)
    return documents

def compute_corpus_embeddings(corpus, contriever_model, tokenizer):
    """Embed every corpus document and return the vectors keyed by ``doc_id``.

    Documents are pooled with the same masked mean pooling that
    ``encode_for_contriever`` applies to queries, so the cosine similarities
    computed later compare vectors from the same representation. (The
    previous ``pooler_output`` did not match the query-side pooling.)

    Args:
        corpus: Iterable of dicts providing ``'doc_id'`` and ``'combined'``.
        contriever_model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``contriever_model``.

    Returns:
        Dict mapping ``doc_id`` to a NumPy array holding the pooled embedding
        of that single document (one row per call to the model).

    Note:
        Reads the module-level ``device`` and calls the module-level
        ``mean_pooling``; both must be defined before this is called.
    """
    corpus_embeddings = {}
    contriever_model.to(device)  # Move the model to the correct device

    for doc in corpus:
        # Tokenized inputs must live on the same device as the model.
        encoded_input = tokenizer(
            doc['combined'], return_tensors="pt", truncation=True, padding=True
        ).to(device)

        with torch.no_grad():
            outputs = contriever_model(**encoded_input)
            embedding = mean_pooling(
                outputs.last_hidden_state, encoded_input['attention_mask']
            )

        # Keep the stored copy on the CPU so the corpus does not occupy
        # accelerator memory.
        corpus_embeddings[doc['doc_id']] = embedding.cpu().numpy()

    return corpus_embeddings

def process_embedding(emb, device="cpu"):
    """Return ``emb`` as a tensor on ``device``.

    NumPy arrays are converted with ``torch.tensor``. For inputs with more
    than one dimension, ``squeeze(0)`` is applied, which removes the leading
    axis only when its size is 1.
    """
    tensor = torch.tensor(emb) if isinstance(emb, np.ndarray) else emb
    if tensor.ndim > 1:
        tensor = tensor.squeeze(0)
    return tensor.to(device)

def rag_on_entire_corpus(
    query, 
    corpus, 
    user_id, 
    product_id,
    corpus_embeddings=None, 
    limit=None, 
    retrieval_method="contriever", 
    filter_field="reviewTitle"
):
    """Retrieve the top documents for ``query`` from the whole corpus.

    The review identified by ``(user_id, product_id)`` is excluded first so
    the target review cannot be retrieved.

    Args:
        query: Query text.
        corpus: List of document dicts (``doc_id``, ``user_id``,
            ``product_id``, ``reviewText``, ``reviewTitle``, ``combined``).
        user_id: User whose review for ``product_id`` is excluded.
        product_id: Product whose review by ``user_id`` is excluded.
        corpus_embeddings: Mapping ``doc_id -> embedding``; required when
            ``retrieval_method == "contriever"``.
        limit: Maximum number of results; a falsy value means "all".
        retrieval_method: ``"contriever"`` for dense retrieval; any other
            value selects BM25.
        filter_field: Accepted for signature parity with the sibling
            retrieval functions; not used here.

    Returns:
        List of ``{"reviewText", "reviewTitle"}`` dicts in ranked order, or
        ``[]`` when nothing is left after the exclusion.

    Raises:
        ValueError: If Contriever retrieval is requested without embeddings.
    """
    # Drop the user's own review for this product.
    filtered_corpus = [
        doc for doc in corpus
        if not (doc['user_id'] == user_id and doc['product_id'] == product_id)
    ]

    if not filtered_corpus:
        print("All documents are from the user for the given product; returning an empty list.")
        return []

    k = limit if limit else len(filtered_corpus)

    if retrieval_method == "contriever":
        if corpus_embeddings is None:
            raise ValueError("Corpus embeddings must be provided for Contriever retrieval.")

        # One embedding per remaining document, in the same order.
        filtered_embeddings = [
            process_embedding(corpus_embeddings[doc['doc_id']])
            for doc in filtered_corpus
        ]
        query_embedding = encode_for_contriever(query)
        top_k_indices = contriever(query_embedding, filtered_embeddings, k=k)
        top_k_docs = [filtered_corpus[i] for i in top_k_indices]
    else:
        combined_documents = [doc["combined"] for doc in filtered_corpus]
        top_k_documents = bm25_retriever(query, combined_documents, k=k)

        # bm25_retriever returns texts, not positions. Queue the positions of
        # each distinct text so that repeated texts resolve to successive
        # documents instead of always to the first match.
        pending = {}
        for position, text in enumerate(combined_documents):
            pending.setdefault(text, []).append(position)

        top_k_docs = []
        for doc_text in top_k_documents:
            slots = pending.get(doc_text)
            if slots:
                top_k_docs.append(filtered_corpus[slots.pop(0)])

    return [
        {"reviewText": doc["reviewText"], "reviewTitle": doc["reviewTitle"]}
        for doc in top_k_docs
    ]


In [4]:
def pgraph_rag_neighbors_ratings_only(
    user_id, 
    product_id, 
    user_product_matrix, 
    user_index, 
    product_index, 
    query, 
    corpus_embeddings,  # Precomputed embeddings
    corpus,             # Corpus should match embeddings by doc_id
    limit=None, 
    retrieval_method="contriever", 
    filter_field="reviewTitle"
):
    """Rank other users' reviews of ``product_id`` against ``query``.

    Args:
        user_id: The current user; their own review is excluded.
        product_id: Product whose reviews are considered.
        user_product_matrix: Mapping ``(user_id, product_id) -> review dict``
            with ``reviewTitle``, ``reviewText`` and ``doc_id``.
        user_index: Unused; kept for interface compatibility.
        product_index: Used only to check that ``product_id`` is known.
        query: Query text.
        corpus_embeddings: Mapping ``doc_id -> embedding``; required for
            Contriever retrieval.
        corpus: Unused; kept for interface compatibility.
        limit: Maximum number of results; a falsy value means "all".
        retrieval_method: ``"contriever"`` for dense retrieval; any other
            value selects BM25.
        filter_field: Unused; kept for interface compatibility.

    Returns:
        List of ``{"user_id", "reviewTitle", "reviewText", "doc_id"}`` dicts
        in ranked order. Returns ``[]`` when the product is unknown or no
        neighbor reviews remain; a single remaining review is returned
        without ranking.

    Raises:
        ValueError: If Contriever retrieval is requested without embeddings.
    """
    if product_index.get(product_id) is None:
        print(f"Product {product_id} not found in product_index.")
        return []

    # Reviews of this product written by anyone except the current user.
    filtered_ratings = [
        {
            "user_id": uid,
            "reviewTitle": review['reviewTitle'],
            "reviewText": review['reviewText'],
            "doc_id": review['doc_id'],
        }
        for (uid, pid), review in user_product_matrix.items()
        if pid == product_id and uid != user_id
    ]

    if not filtered_ratings:
        print("No neighbor ratings after filtering.")
        return []

    # Nothing to rank when only one candidate is left.
    if len(filtered_ratings) == 1:
        return filtered_ratings

    if retrieval_method == "contriever":
        if corpus_embeddings is None:
            raise ValueError("Corpus embeddings must be provided for Contriever retrieval.")

        # Reviews with blank text are not part of the corpus and therefore
        # have no embedding; skip them instead of failing with KeyError, and
        # keep the candidate list aligned with the embedding list.
        candidates = []
        filtered_embeddings = []
        for review in filtered_ratings:
            emb = corpus_embeddings.get(review['doc_id'])
            if emb is None:
                continue
            candidates.append(review)
            filtered_embeddings.append(process_embedding(emb))

        if not filtered_embeddings:
            print("No valid embeddings for filtered documents.")
            return []

        query_embedding = encode_for_contriever(query)
        top_k_indices = contriever(
            query_embedding,
            filtered_embeddings,
            k=limit if limit else len(filtered_embeddings),
        )
        top_k_neighbors = [candidates[i] for i in top_k_indices]

    else:
        # Title and text are ranked together.
        combined_documents = [
            f"{review['reviewTitle']} {review['reviewText']}" for review in filtered_ratings
        ]
        top_k_documents = bm25_retriever(
            query, combined_documents, k=limit if limit else len(combined_documents)
        )

        # bm25_retriever returns texts, not positions. Queue the positions of
        # each distinct text so identical reviews by different users map to
        # different neighbors rather than the first one repeatedly.
        pending = {}
        for position, text in enumerate(combined_documents):
            pending.setdefault(text, []).append(position)

        top_k_neighbors = []
        for doc in top_k_documents:
            slots = pending.get(doc)
            if slots:
                top_k_neighbors.append(filtered_ratings[slots.pop(0)])

    return top_k_neighbors

In [5]:
def get_user_all_ratings(
    user_id, 
    product_id,  # Add product_id parameter
    user_product_matrix, 
    query, 
    corpus_embeddings,  # Precomputed embeddings
    corpus,             # Corpus should match embeddings by index
    limit=None, 
    retrieval_method="contriever", 
    filter_field="reviewTitle"
):
    """Rank the user's own reviews, other than the target one, against ``query``.

    Args:
        user_id: User whose review history is searched.
        product_id: Product of the target review; that review is excluded.
        user_product_matrix: Mapping ``(user_id, product_id) -> review dict``
            with ``reviewTitle``, ``reviewText`` and ``doc_id``.
        query: Query text.
        corpus_embeddings: Mapping ``doc_id -> embedding``; required for
            Contriever retrieval.
        corpus: Unused; kept for interface compatibility.
        limit: Maximum number of results; a falsy value means "all".
        retrieval_method: ``"contriever"`` for dense retrieval; any other
            value selects BM25.
        filter_field: Unused; kept for interface compatibility.

    Returns:
        List of ``{"product_id", "reviewTitle", "reviewText", "doc_id"}``
        dicts in ranked order, or ``[]`` when the user has no other reviews.

    Raises:
        ValueError: If Contriever retrieval is requested without embeddings.
    """
    user_ratings = [
        {
            "product_id": pid,
            "reviewTitle": review.get('reviewTitle', "None"),
            "reviewText": review.get('reviewText', "None"),
            "doc_id": review['doc_id'],
        }
        for (uid, pid), review in user_product_matrix.items()
        if uid == user_id
    ]

    if not user_ratings:
        print(f"No reviews found for user {user_id}.")
        return []

    # The matrix holds at most one review per (user, product) pair, so the
    # first match is the target review.
    query_doc_id = next(
        (review['doc_id'] for review in user_ratings if review['product_id'] == product_id),
        None,
    )

    if query_doc_id is None:
        print(f"No matching review found for user {user_id} and product {product_id}.")
        # Proceed without excluding anything.
        filtered_ratings = user_ratings
    else:
        filtered_ratings = [
            review for review in user_ratings
            if review['doc_id'] != query_doc_id
        ]

    if not filtered_ratings:
        return []

    if retrieval_method == "contriever":
        if corpus_embeddings is None:
            raise ValueError("Corpus embeddings must be provided for Contriever retrieval.")

        # Reviews with blank text are not part of the corpus and therefore
        # have no embedding; skip them instead of failing with KeyError, and
        # keep the candidate list aligned with the embedding list.
        candidates = []
        filtered_embeddings = []
        for review in filtered_ratings:
            emb = corpus_embeddings.get(review['doc_id'])
            if emb is None:
                continue
            candidates.append(review)
            filtered_embeddings.append(process_embedding(emb))

        if not filtered_embeddings:
            print("No valid embeddings for filtered documents.")
            return []

        query_embedding = encode_for_contriever(query)
        top_k_indices = contriever(
            query_embedding,
            filtered_embeddings,
            k=limit if limit else len(filtered_embeddings),
        )
        top_k_user_reviews = [candidates[i] for i in top_k_indices]
    else:
        # Title and text are ranked together; only the BM25 branch needs them.
        combined_documents = [
            f"{review['reviewTitle']} {review['reviewText']}" for review in filtered_ratings
        ]
        top_k_documents = bm25_retriever(
            query, combined_documents, k=limit if limit else len(combined_documents)
        )

        # bm25_retriever returns texts, not positions. Queue the positions of
        # each distinct text so identical reviews of different products map
        # to different entries rather than the first one repeatedly.
        pending = {}
        for position, text in enumerate(combined_documents):
            pending.setdefault(text, []).append(position)

        top_k_user_reviews = []
        for doc in top_k_documents:
            slots = pending.get(doc)
            if slots:
                top_k_user_reviews.append(filtered_ratings[slots.pop(0)])

    return top_k_user_reviews


In [6]:
import torch

def mean_pooling(token_embeddings, mask):
    """Average token embeddings over dim 1, ignoring positions where ``mask`` is 0.

    Masked positions are zeroed before summing, and the sum is divided by the
    per-row count of unmasked positions.
    """
    keep = mask[..., None].bool()
    zeroed = token_embeddings.masked_fill(~keep, 0.)
    token_counts = mask.sum(dim=1)[..., None]
    return zeroed.sum(dim=1) / token_counts

def encode_for_contriever(text):
    """Encode ``text`` into a mean-pooled embedding on ``device``.

    Uses the module-level ``contriever_tokenizer``, ``contriever_model`` and
    ``device``. The pooled output has its leading dimension squeezed, so a
    single input text yields a 1-D vector.
    """
    batch = contriever_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    batch = batch.to(device)

    with torch.no_grad():
        # The model is moved on every call so it always matches the inputs.
        contriever_model.to(device)
        hidden_states = contriever_model(**batch).last_hidden_state
        # Average only over real tokens, as indicated by the attention mask.
        pooled = mean_pooling(hidden_states, batch['attention_mask'])

    pooled = pooled.squeeze(0)
    return pooled.to(device)


def contriever(query_embedding, document_embeddings, k=1):
    """Return the positions of the ``k`` documents most similar to the query.

    Similarity is the cosine between ``query_embedding`` and each entry of
    ``document_embeddings`` (tensors, or values convertible with
    ``torch.tensor``). All tensors are moved to the module-level ``device``.

    Returns:
        List of Python ints indexing ``document_embeddings``, best first.

    Raises:
        ValueError: If ``query_embedding`` is not a ``torch.Tensor``.
    """
    if not isinstance(query_embedding, torch.Tensor):
        raise ValueError(f"Expected query_embedding to be a tensor, but got {type(query_embedding)}")
    query_vec = query_embedding.to(device)

    # Normalise every document embedding to a tensor on the target device.
    doc_tensors = []
    for emb in document_embeddings:
        if not isinstance(emb, torch.Tensor):
            emb = torch.tensor(emb)
        doc_tensors.append(emb.to(device))

    # One row per document.
    doc_matrix = torch.stack(doc_tensors)

    scores = torch.nn.functional.cosine_similarity(doc_matrix, query_vec.unsqueeze(0), dim=1)
    scores = scores.cpu().numpy().squeeze()

    # NaN scores would make the ordering meaningless, so treat them as 0.
    if np.isnan(scores).any():
        print("Similarities contain NaN values. Replacing NaNs with zeros.")
        scores = np.nan_to_num(scores)

    # argsort is ascending; reverse it to rank the best match first.
    ranked = np.argsort(scores)[::-1]
    return [int(position) for position in ranked[:k]]



In [7]:
def tokenize_for_bm25(text):
    """Return the list of tokens that the module-level ``tokenizer`` produces for ``text``.

    The tokens are returned as a list (not re-joined into a string) because
    BM25 scoring works on token lists.
    """
    return tokenizer.tokenize(text)

def bm25_retriever(query, documents, k=1):
    """Return the ``k`` documents that BM25 scores highest for ``query``.

    Args:
        query: Query text.
        documents: List of document strings.
        k: Maximum number of documents to return.

    Returns:
        Up to ``k`` elements of ``documents``, best match first. Documents
        that tokenize to nothing are never returned. ``[]`` is returned when
        ``documents`` is empty or no document yields any token.
    """
    if not documents:
        print("No valid documents to retrieve.")
        return []  # Return an empty list if all documents were filtered out

    # Keep each surviving document next to its tokens. Scores are indexed by
    # position in the tokenized list, so the texts must be filtered in
    # lock-step or the indices would point at the wrong documents.
    kept_documents = []
    tokenized_documents = []
    for doc in documents:
        tokens = tokenize_for_bm25(doc)
        if tokens:
            kept_documents.append(doc)
            tokenized_documents.append(tokens)

    if not tokenized_documents:
        print("Tokenization resulted in no valid tokens.")
        return []  # Return an empty list if tokenization fails

    bm25 = BM25Okapi(tokenized_documents)
    doc_scores = bm25.get_scores(tokenize_for_bm25(query))
    # argsort is ascending; reverse it to rank the best match first.
    top_k_indices = np.argsort(doc_scores)[::-1][:k]

    return [kept_documents[i] for i in top_k_indices]

In [8]:
def process_item(
    item, 
    user_product_matrix, 
    user_index, 
    product_index, 
    tokenizer, 
    corpus, 
    corpus_embeddings,  # Precomputed corpus embeddings
    limit, 
    retrieval_method="contriever", 
    filter_field="reviewTitle"
):
    """Run the three retrieval strategies for one user record.

    The first review in ``item['profile']`` is the target: its product is the
    query product and one of its fields supplies the query text.

    Args:
        item: User record with ``'id'`` and a non-empty ``'profile'`` list.
        user_product_matrix: Mapping ``(user_id, product_id) -> review dict``.
        user_index: Passed through to the neighbor retrieval.
        product_index: Passed through to the neighbor retrieval.
        tokenizer: Not used here; kept for interface compatibility.
        corpus: List of corpus documents.
        corpus_embeddings: Mapping ``doc_id -> embedding`` (may be ``None``
            for BM25).
        limit: Maximum number of results per strategy.
        retrieval_method: ``"contriever"`` or anything else for BM25.
        filter_field: Name of the field that supplies the query text.

    Returns:
        Dict with the user/product ids, the target review's raw ``text`` and
        ``title``, and the ranked ``user_ratings``, ``neighbor_ratings`` and
        ``all_ratings`` lists.
    """
    target_review = item['profile'][0]
    example_user_id = item['id']
    example_product_id = target_review['productAsin']

    # Raw reviews store their fields as 'title'/'text', while the rest of the
    # pipeline names them 'reviewTitle'/'reviewText'. Look the field up under
    # the requested name first and then under its raw alias; without the
    # alias the query always degraded to the literal "None".
    raw_field_aliases = {"reviewTitle": "title", "reviewText": "text"}
    query = (
        target_review.get(filter_field)
        or target_review.get(raw_field_aliases.get(filter_field, filter_field))
        or "None"
    )

    # The user's other reviews (the target product's review is excluded).
    user_ratings = get_user_all_ratings(
        user_id=example_user_id,
        product_id=example_product_id,
        user_product_matrix=user_product_matrix,
        query=query,
        corpus_embeddings=corpus_embeddings,
        corpus=corpus,
        limit=limit,
        retrieval_method=retrieval_method,
        filter_field=filter_field
    )

    # Other users' reviews of the same product.
    neighbor_ratings = pgraph_rag_neighbors_ratings_only(
        user_id=example_user_id, 
        product_id=example_product_id, 
        user_product_matrix=user_product_matrix, 
        user_index=user_index, 
        product_index=product_index, 
        query=query, 
        corpus_embeddings=corpus_embeddings, 
        corpus=corpus,
        limit=limit, 
        retrieval_method=retrieval_method, 
        filter_field=filter_field
    )

    # The whole corpus minus the target review.
    all_ratings = rag_on_entire_corpus(
        query=query, 
        corpus=corpus, 
        user_id=example_user_id, 
        product_id=example_product_id,
        corpus_embeddings=corpus_embeddings, 
        limit=limit, 
        retrieval_method=retrieval_method, 
        filter_field=filter_field
    )

    return {
        "user_id": example_user_id,
        "product_id": example_product_id,
        "user_review_text": target_review.get('text', None),
        "user_review_title": target_review.get('title', None),
        "user_ratings": user_ratings,
        "neighbor_ratings": neighbor_ratings,
        "all_ratings": all_ratings
    }

def process_and_save(file_data_list, tokenizer, limit, retrieval_method="contriever"):
    """Rank every item of every dataset and write one JSON file per dataset.

    Args:
        file_data_list: List of dicts, one per dataset, providing ``items``,
            ``output_file``, ``user_product_matrix``, ``user_index``,
            ``product_index``, ``corpus`` and optionally
            ``corpus_embeddings``.
        tokenizer: Forwarded to ``process_item``.
        limit: Maximum number of results per retrieval strategy.
        retrieval_method: Forwarded to ``process_item``.

    Each dataset's results are dumped to its ``output_file`` with an indent
    of 4. The query field is fixed to ``"reviewTitle"``.
    """
    for dataset in tqdm(file_data_list, desc="Processing datasets", unit="dataset"):
        items = dataset['items']
        output_file = dataset['output_file']
        # Everything except the item itself is constant within a dataset.
        shared_arguments = dict(
            user_product_matrix=dataset['user_product_matrix'],
            user_index=dataset['user_index'],
            product_index=dataset['product_index'],
            tokenizer=tokenizer,
            corpus=dataset['corpus'],
            corpus_embeddings=dataset.get('corpus_embeddings'),
            limit=limit,
            retrieval_method=retrieval_method,
            filter_field="reviewTitle",
        )

        embeddings_state = 'not None' if shared_arguments['corpus_embeddings'] else 'None'
        print(f"Processing dataset: {output_file}")
        print(f"Corpus embeddings is {embeddings_state}")

        print(f"\nProcessing data for {output_file}...")
        item_progress = tqdm(items, desc=f"Processing items in {output_file}", unit="item", leave=False)
        results = [process_item(item=entry, **shared_arguments) for entry in item_progress]

        # Persist this dataset's rankings before moving on to the next one.
        with open(output_file, 'w') as outfile:
            json.dump(results, outfile, indent=4)

        print(f"Ranked ratings saved to {output_file}")

In [None]:
# Module-level `device`: read as a global by compute_corpus_embeddings,
# encode_for_contriever and contriever, so it must be set before they run.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input location and naming scheme for the dataset files and their outputs.
file_path = "../data/AmazonReview/"
file_name_base = "amazon_title_generation_questions"
limit = 5  # top-k passed to every retrieval call
ranked_suffix = f"_ranked_k_{limit}"
filter_field="reviewTitle"
task_suffix = f"{filter_field}"

# Suffixes of the splits processed here (the train split is commented out below)
file_suffixes = ["test", "dev"]

# Construct the full file names with paths using the suffixes
file_names = [os.path.join(file_path, f"{file_name_base}_{suffix}.json") for suffix in file_suffixes]

# Load the datasets into variables using the dynamically created file names
# (indices follow the order of `file_suffixes`: 0 = test, 1 = dev)
#train_users = load_data(file_names[0])
test_users = load_data(file_names[0])
dev_users = load_data(file_names[1])

# Create the user-product matrices for each dataset
#train_user_product_matrix, train_user_index, train_product_index = create_user_product_matrix(train_users)
test_user_product_matrix, test_user_index, test_product_index = create_user_product_matrix(test_users)
dev_user_product_matrix, dev_user_index, dev_product_index = create_user_product_matrix(dev_users)

# Retrieve the entire corpus from each matrix
#train_corpus = retrieve_corpus_from_matrix(train_user_product_matrix)
test_corpus = retrieve_corpus_from_matrix(test_user_product_matrix)
dev_corpus = retrieve_corpus_from_matrix(dev_user_product_matrix)

# Load the Contriever model and tokenizer
# (both names are also read as globals by encode_for_contriever)
contriever_tokenizer = AutoTokenizer.from_pretrained("facebook/contriever")
contriever_model = AutoModel.from_pretrained("facebook/contriever")

# Embeddings are computed for every corpus document up front.
# NOTE(review): they are only consumed when retrieval_method == "contriever";
# the run below uses BM25, so this step may be skippable — confirm.
test_corpus_embeddings = compute_corpus_embeddings(test_corpus, contriever_model, contriever_tokenizer)
dev_corpus_embeddings = compute_corpus_embeddings(dev_corpus, contriever_model, contriever_tokenizer)

contriever_model.to(device)
# NOTE(review): `llama3_model` is not referenced by any other code visible in
# this notebook; only the tokenizer below is used (by tokenize_for_bm25, which
# reads the module-level name `tokenizer`). Confirm the pipeline is needed.
llama3_model = pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device_map="auto",)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

# One entry per dataset, in the shape process_and_save expects. The output
# file names end in "_bm25", matching the retrieval method used for this run.
file_data_list = [
    # {
    #     'items': train_users, 
    #     'output_file': os.path.join(file_path, f"{file_name_base}_train{ranked_suffix}.json"), 
    #     'user_product_matrix': train_user_product_matrix, 
    #     'user_index': train_user_index, 
    #     'product_index': train_product_index,
    #     'corpus': train_corpus
    # },
    
    {
        'items': test_users, 
        'output_file': os.path.join(file_path, f"{file_name_base}_test{ranked_suffix}_{task_suffix}_bm25.json"), 
        'user_product_matrix': test_user_product_matrix, 
        'user_index': test_user_index, 
        'product_index': test_product_index,
        'corpus': test_corpus,
        'corpus_embeddings': test_corpus_embeddings 
    },
    {
        'items': dev_users, 
        'output_file': os.path.join(file_path, f"{file_name_base}_dev{ranked_suffix}_{task_suffix}_bm25.json"), 
        'user_product_matrix': dev_user_product_matrix, 
        'user_index': dev_user_index, 
        'product_index': dev_product_index,
        'corpus': dev_corpus,
        'corpus_embeddings': dev_corpus_embeddings 
    }
]


In [None]:
# The retrieval helpers take the BM25 branch for any retrieval_method other
# than "contriever", so "bm_25" selects BM25 here.
process_and_save(file_data_list, tokenizer, limit, retrieval_method="bm_25")

In [None]:
# import json

# # Function to update user_review_text in the output file using data from user_product_matrix
# def update_user_review_text(output_data, user_product_matrix):
#     for entry in output_data:
#         user_id = entry['user_id']
#         product_id = entry['product_id']
        
#         # Check if the user and product exist in the matrix
#         if (user_id, product_id) in user_product_matrix:
#             # Retrieve and update the review text and title
#             review_data = user_product_matrix.get((user_id, product_id), {})
#             entry['user_review_text'] = review_data.get('reviewText', 'No review text available')
#             entry['user_review_title'] = review_data.get('reviewTitle', 'No review title available')
#         else:
#             # Handle missing cases explicitly
#             print(f"No matching review found for user_id {user_id} and product_id {product_id}.")
#             entry['user_review_text'] = 'No review text available'
#             entry['user_review_title'] = 'No review title available'
    
#     return output_data


# # Load the output JSON file (amazon_title_generation.json)
# output_file = "../data/AmazonReview/amazon_title_generation_questions_dev_ranked_k_5_reviewText.json" 
# with open(output_file, 'r') as file:
#     output_data = json.load(file)

# # Update the user_review_text in the output file
# updated_data = update_user_review_text(output_data, dev_user_product_matrix)

# # Save the updated data back to the output file
# with open(output_file, 'w') as file:
#     json.dump(updated_data, file, indent=4)

# print(f"Updated JSON saved to {output_file}")


In [None]:
from IPython.display import display
from ipywidgets import Button

def shutdown_kernel():
    """Display a notice, then ask the running IPython kernel to shut down."""
    from IPython.display import display
    display("Shutting down kernel...")
    # `get_ipython` is provided by the IPython runtime, not imported here.
    # NOTE(review): the positional True is presumably the kernel's `restart`
    # flag — confirm against the ipykernel version in use.
    get_ipython().kernel.do_shutdown(True)

# Executed as soon as this cell runs; nothing after it in the notebook runs.
shutdown_kernel()
