In [1]:
!pip install datasets



In [2]:
import pandas as pd
import random
from datasets import load_dataset
from collections import defaultdict
import tqdm
from datetime import datetime, timedelta
import logging

# Show INFO-level records so the progress messages logged by later cells are visible.
logging.basicConfig(level=logging.INFO)

# Seed Python's `random` module once up front; later cells shuffle and sample with it.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Load the "raw_review_All_Beauty" configuration of the Amazon Reviews 2023 dataset
# and keep its "full" split.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script run locally -- confirm the source is trusted before running.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
ds = dataset["full"]

# The reference date is a fixed calendar date (January 1, 2024), not the current time.
FIXED_CUTOFF_DATE = datetime(2024, 1, 1)
# Reviews are kept only when they are newer than 3 * 365 days before the reference date.
DATE_CUTOFF = FIXED_CUTOFF_DATE - timedelta(days=3*365)

In [3]:
def convert_timestamp(timestamp):
    """Convert a Unix timestamp in milliseconds to a naive UTC ``datetime``.

    The conversion is done with plain epoch arithmetic instead of
    ``datetime.fromtimestamp``: the latter converts to the *local* time of the
    machine running the notebook, which made the date-cutoff filter depend on
    the host's timezone setting.

    Args:
        timestamp: Milliseconds elapsed since 1970-01-01 00:00:00 UTC
            (int or float; negative values are accepted).

    Returns:
        A timezone-naive ``datetime`` expressed in UTC, directly comparable
        with other naive datetimes such as the notebook's cutoff date.
    """
    return datetime(1970, 1, 1) + timedelta(milliseconds=timestamp)

logging.info("Filtering dataset for time cutoff")
# Keep a review only when its converted timestamp is strictly later than DATE_CUTOFF.
filtered_ds = []
for record in tqdm.tqdm(ds):
    if convert_timestamp(record['timestamp']) > DATE_CUTOFF:
        filtered_ds.append(record)

# Group the surviving reviews under the user_id of their author
logging.info("Organizing data by user_id")
d_dic = defaultdict(list)
for record in filtered_ds:
    author_id = record["user_id"]
    d_dic[author_id].append(record)

# Report how many distinct users remain after filtering (both logged and printed)
num_user_profiles = len(d_dic)
profile_count_message = f"Number of user profiles in the filtered dataset: {num_user_profiles}"
logging.info(profile_count_message)
print(profile_count_message)

# Profile size = number of reviews stored for a user; the distribution below is
# the share of users at each profile size, ordered by size.
user_profile_sizes = {user_id: len(reviews) for user_id, reviews in d_dic.items()}
user_profile_df = pd.DataFrame(
    list(user_profile_sizes.items()),
    columns=['user_id', 'profile_size'],
)
profile_size_shares = user_profile_df['profile_size'].value_counts(normalize=True)
filtered_profile_size_distribution = profile_size_shares.sort_index()

# Show the distribution followed by the number of distinct profile sizes
logging.info("Profile size distribution in the filtered dataset:")
print(filtered_profile_size_distribution)
print(len(filtered_profile_size_distribution))


INFO:root:Filtering dataset for time cutoff
100%|██████████████| 701528/701528 [00:37<00:00, 18670.56it/s]
INFO:root:Organizing data by user_id
INFO:root:Number of user profiles in the filtered dataset: 184701
INFO:root:Profile size distribution in the filtered dataset:


Number of user profiles in the filtered dataset: 184701
profile_size
1     0.946633
2     0.042425
3     0.005571
4     0.002317
5     0.000893
6     0.000666
7     0.000303
8     0.000227
9     0.000179
10    0.000092
11    0.000103
12    0.000049
13    0.000065
14    0.000060
15    0.000027
16    0.000054
17    0.000043
18    0.000027
19    0.000032
20    0.000011
21    0.000005
22    0.000022
23    0.000011
24    0.000032
26    0.000027
27    0.000005
28    0.000005
29    0.000011
31    0.000011
32    0.000005
35    0.000005
40    0.000016
41    0.000005
42    0.000005
45    0.000005
48    0.000005
52    0.000005
55    0.000005
58    0.000005
62    0.000005
64    0.000005
73    0.000005
74    0.000005
81    0.000005
Name: proportion, dtype: float64
44


In [4]:
# Mean length (in characters of the 'text' field) over all filtered reviews
logging.info("Calculating average review length size")
total_reviews = len(filtered_ds)
total_review_length = sum(len(review['text']) for review in filtered_ds)

# An empty dataset yields an average of 0 instead of a division error.
average_review_length = total_review_length / total_reviews if total_reviews > 0 else 0

summary_lines = (
    f"Total number of reviews: {total_reviews}",
    f"Average review length: {average_review_length}",
)
# Log both lines first, then print both, so the output order stays the same.
for summary_line in summary_lines:
    logging.info(summary_line)
for summary_line in summary_lines:
    print(summary_line)

INFO:root:Calculating average review length size
INFO:root:Total number of reviews: 200801
INFO:root:Average review length: 165.64270098256483


Total number of reviews: 200801
Average review length: 165.64270098256483


In [5]:
# Tally how many of the filtered reviews each product (asin) received
product_review_counts = defaultdict(int)
for reviewed_asin in (review['asin'] for review in filtered_ds):
    product_review_counts[reviewed_asin] += 1

def print_product_review_distribution_with_percentages(local_product_counts, split_name):
    """Print how many products received each number of reviews, with percentages.

    Args:
        local_product_counts: Mapping of product id (asin) to its review count.
        split_name: Label inserted into the header line.

    Prints a header, one line per distinct review count in ascending order,
    and a trailing blank block. Nothing is returned.
    """
    # Invert the mapping: review count -> number of products with that count.
    products_per_count = {}
    for n_reviews in local_product_counts.values():
        products_per_count[n_reviews] = products_per_count.get(n_reviews, 0) + 1

    total_products = sum(products_per_count.values())
    print(f"Review Count Distribution for {split_name} split:")

    for n_reviews in sorted(products_per_count):
        n_products = products_per_count[n_reviews]
        share = (n_products / total_products) * 100
        print(f"Products with {n_reviews} reviews: {n_products} ({share:.2f}%)")
    print("\n")

# Report the per-product review-count distribution computed over the date-filtered reviews.
print_product_review_distribution_with_percentages(product_review_counts, "Filtered Dataset")

Review Count Distribution for Filtered Dataset split:
Products with 1 reviews: 25529 (49.70%)
Products with 2 reviews: 9486 (18.47%)
Products with 3 reviews: 4782 (9.31%)
Products with 4 reviews: 2636 (5.13%)
Products with 5 reviews: 1836 (3.57%)
Products with 6 reviews: 1213 (2.36%)
Products with 7 reviews: 925 (1.80%)
Products with 8 reviews: 699 (1.36%)
Products with 9 reviews: 537 (1.05%)
Products with 10 reviews: 443 (0.86%)
Products with 11 reviews: 367 (0.71%)
Products with 12 reviews: 309 (0.60%)
Products with 13 reviews: 249 (0.48%)
Products with 14 reviews: 220 (0.43%)
Products with 15 reviews: 182 (0.35%)
Products with 16 reviews: 178 (0.35%)
Products with 17 reviews: 144 (0.28%)
Products with 18 reviews: 123 (0.24%)
Products with 19 reviews: 115 (0.22%)
Products with 20 reviews: 103 (0.20%)
Products with 21 reviews: 95 (0.18%)
Products with 22 reviews: 80 (0.16%)
Products with 23 reviews: 66 (0.13%)
Products with 24 reviews: 74 (0.14%)
Products with 25 reviews: 50 (0.10%)
P

In [6]:
# For every product, count the users whose first stored review is about it
global_neighbor_counts = defaultdict(int)

# TODO (original author's note): adjust this to work with the random-review
# global neighbor count.
# Only the first review of each user's profile contributes to the tally.
for profile_reviews in d_dic.values():
    global_neighbor_counts[profile_reviews[0]['asin']] += 1

def print_global_neighbor_distribution_with_percentages(global_neighbor_counts, split_name):
    """Print how many products have each global neighbor count, with percentages.

    Args:
        global_neighbor_counts: Mapping of product id (asin) to the number of
            users counted for that product.
        split_name: Label inserted into the header line.

    Prints a header, one line per distinct count in ascending order, and a
    trailing blank block. Nothing is returned.
    """
    # Histogram: neighbor count -> number of products with exactly that count.
    histogram = defaultdict(int)
    for per_product_count in global_neighbor_counts.values():
        histogram[per_product_count] += 1

    # Every product falls into exactly one histogram bucket.
    total_products = sum(histogram.values())

    print(f"Global Neighbor Count Distribution for {split_name} split (based on first review):")
    for neighbor_count in sorted(histogram):
        product_count = histogram[neighbor_count]
        percentage = (product_count / total_products) * 100
        print(f"Products with {neighbor_count} reviews: {product_count} ({percentage:.2f}%)")
    print("\n")

# Report the distribution of first-review neighbor counts across products.
print_global_neighbor_distribution_with_percentages(global_neighbor_counts, "Global Neighbors (First Review)")

Global Neighbor Count Distribution for Global Neighbors (First Review) split (based on first review):
Products with 1 reviews: 25042 (51.07%)
Products with 2 reviews: 9088 (18.53%)
Products with 3 reviews: 4451 (9.08%)
Products with 4 reviews: 2402 (4.90%)
Products with 5 reviews: 1681 (3.43%)
Products with 6 reviews: 1137 (2.32%)
Products with 7 reviews: 828 (1.69%)
Products with 8 reviews: 632 (1.29%)
Products with 9 reviews: 496 (1.01%)
Products with 10 reviews: 401 (0.82%)
Products with 11 reviews: 313 (0.64%)
Products with 12 reviews: 303 (0.62%)
Products with 13 reviews: 217 (0.44%)
Products with 14 reviews: 178 (0.36%)
Products with 15 reviews: 173 (0.35%)
Products with 16 reviews: 151 (0.31%)
Products with 17 reviews: 127 (0.26%)
Products with 18 reviews: 111 (0.23%)
Products with 19 reviews: 94 (0.19%)
Products with 20 reviews: 82 (0.17%)
Products with 21 reviews: 68 (0.14%)
Products with 22 reviews: 64 (0.13%)
Products with 23 reviews: 55 (0.11%)
Products with 24 reviews: 68 

In [7]:
# Target number of users for each split
num_users_train = 20000  # Desired number of users in the training set
num_users_dev = 2500
num_users_test = 2500

# Materialize the user_id -> reviews mapping as a list of (user_id, reviews) pairs
users_list = list(d_dic.items())
    
# Build global product counts across the entire dataset
def build_global_product_counts(users):
    """Map each product to the users whose first review is about that product.

    Args:
        users: Iterable of (user_id, reviews) pairs. Each ``reviews`` sequence
            must be non-empty and its first item must expose an 'asin' key.

    Returns:
        A ``defaultdict(list)`` mapping asin -> list of user_ids, in the order
        the users were encountered. Only the first review of every profile is
        considered.
    """
    reviewers_by_asin = defaultdict(list)
    for uid, profile in users:
        reviewers_by_asin[profile[0]['asin']].append(uid)
    return reviewers_by_asin

# Product -> users mapping (based on each user's first review) over every user in users_list
global_product_counts = build_global_product_counts(users_list)

# Function to pick a random review that has global neighbors, or exclude the user if none do
def set_random_first_review_with_global_neighbor(user_reviews, global_product_counts):
    """Move a randomly chosen review that has a global neighbor to the front.

    A review has a global neighbor when more than one user is listed for its
    product in ``global_product_counts``.

    Args:
        user_reviews: List of review dicts, each with an 'asin' key. The list
            is reordered in place when a suitable review exists.
        global_product_counts: Mapping asin -> list of user_ids. It is only
            read; unknown asins are treated as having no users.

    Returns:
        The same ``user_reviews`` list with the chosen review at index 0, or
        ``None`` when no review has a global neighbor (the user should then be
        excluded by the caller).
    """
    # Use .get() rather than indexing: indexing a defaultdict inserts an empty
    # entry for every unseen asin, silently growing the shared mapping (and a
    # plain dict would raise KeyError).
    valid_reviews = [
        review for review in user_reviews
        if len(global_product_counts.get(review['asin'], ())) > 1
    ]

    if not valid_reviews:
        return None  # No review with global neighbors: exclude the user

    # Promote one eligible review, picked with the module-level RNG, to the front.
    random_review = random.choice(valid_reviews)
    user_reviews.remove(random_review)
    user_reviews.insert(0, random_review)
    return user_reviews

# Helper function to add a user to a specified split
def add_user_to_split(user_id, user_reviews, split_users, local_product_counts, added_users):
    """Register a user in a split and account for all of the user's reviews.

    Args:
        user_id: Identifier of the user being added.
        user_reviews: The user's review dicts (each with an 'asin' key).
        split_users: List of (user_id, reviews) pairs for the split; appended to.
        local_product_counts: Per-split mapping asin -> review count; every
            review of the user increments its product's entry.
        added_users: Set of user_ids already placed in some split; updated.

    All three containers are modified in place; nothing is returned.
    """
    membership = (user_id, user_reviews)
    split_users.append(membership)
    added_users.add(user_id)

    # Every review of the profile counts towards the split's product tallies.
    for reviewed_asin in (review['asin'] for review in user_reviews):
        local_product_counts[reviewed_asin] += 1

# Function to distribute global neighbor groups across train, dev, and test splits with local neighbor check
def distribute_global_neighbors(users, num_users_train, num_users_dev, num_users_test, global_product_counts, seed=None):
    """Distribute users over train/dev/test splits, product by product.

    Products listed for more than one user are visited in a seeded random
    order. For each product its users are shuffled (seeded), then:

    * the first two users go to the first split -- train, then dev, then
      test -- that still has room for a pair (fewer than capacity - 1 users);
    * every remaining user is offered to the first split that is not yet full
      and is added only when that split already counts at least one review of
      the product (a "local neighbor"); otherwise the user is left out.

    Users already placed are never placed again. Processing stops as soon as
    all three splits have reached their capacity.

    Args:
        users: Iterable of (user_id, reviews) pairs.
        num_users_train: Capacity of the train split.
        num_users_dev: Capacity of the dev split.
        num_users_test: Capacity of the test split.
        global_product_counts: Mapping asin -> list of user_ids. It is only
            read: shuffling is done on copies, so repeated calls with the same
            inputs return the same splits.
        seed: Seed applied to ``random`` before every shuffle. Defaults to the
            module-level ``RANDOM_SEED``.

    Returns:
        Tuple ``(train_users, dev_users, test_users, local_product_counts_train,
        local_product_counts_dev, local_product_counts_test)``. The user lists
        hold (user_id, reviews) pairs; the count mappings hold asin -> number
        of reviews in that split and contain only products that were actually
        added (no zero-count placeholders).

    Raises:
        KeyError: If a user_id from ``global_product_counts`` has to be added
            but does not appear in ``users``.
    """
    if seed is None:
        seed = RANDOM_SEED

    # Index reviews by user once; the first occurrence of a user_id wins.
    reviews_by_user = {}
    for user_id, user_reviews in users:
        reviews_by_user.setdefault(user_id, user_reviews)

    train_users, dev_users, test_users = [], [], []
    local_product_counts_train = defaultdict(int)
    local_product_counts_dev = defaultdict(int)
    local_product_counts_test = defaultdict(int)

    # (members, per-split product counts, capacity), in filling priority order.
    splits = (
        (train_users, local_product_counts_train, num_users_train),
        (dev_users, local_product_counts_dev, num_users_dev),
        (test_users, local_product_counts_test, num_users_test),
    )

    # Users that have already been placed in some split.
    added_users = set()

    def _all_splits_full():
        """Return True once every split has reached its capacity."""
        return all(len(members) >= capacity for members, _, capacity in splits)

    # Products with a single user have no global neighbor and are skipped.
    filtered_product_list = [
        asin for asin, product_user_ids in global_product_counts.items()
        if len(product_user_ids) > 1
    ]

    # Seed before shuffling so the product order is reproducible.
    random.seed(seed)
    random.shuffle(filtered_product_list)

    for asin in filtered_product_list:
        # Once every split is full no further user can be placed anywhere.
        if _all_splits_full():
            break

        # Shuffle a copy so the caller's mapping keeps its original order.
        user_ids = list(global_product_counts[asin])
        random.seed(seed)
        random.shuffle(user_ids)

        # The first two users seed the first split with room for a pair.
        for members, counts, capacity in splits:
            if len(members) < capacity - 1:
                for user_id in user_ids[:2]:
                    if user_id not in added_users:
                        add_user_to_split(user_id, reviews_by_user[user_id], members, counts, added_users)
                break

        # Remaining users need a local neighbor in the split currently being filled.
        for user_id in user_ids[2:]:
            if user_id in added_users:
                continue

            for members, counts, capacity in splits:
                if len(members) < capacity:
                    # .get() avoids creating zero-count entries for products
                    # that are not part of this split.
                    if counts.get(asin, 0) > 0:
                        add_user_to_split(user_id, reviews_by_user[user_id], members, counts, added_users)
                    break

            if _all_splits_full():
                break

    return train_users, dev_users, test_users, local_product_counts_train, local_product_counts_dev, local_product_counts_test


In [None]:
# Build the three splits together with their per-split product review counts.
train_users, dev_users, test_users, local_product_counts_train, local_product_counts_dev, local_product_counts_test = distribute_global_neighbors(
    users_list, num_users_train, num_users_dev, num_users_test, global_product_counts
)

# Check the number of users added to each set (a split may end up with fewer
# users than requested, so report all three).
logging.info(f"Number of users in the train set: {len(train_users)}")
logging.info(f"Number of users in the test set: {len(test_users)}")
logging.info(f"Number of users in the dev set: {len(dev_users)}")

In [None]:
# Function to get the profile size distribution
def get_profile_size_distribution(users):
    """Count how many users have each profile size.

    Args:
        users: Iterable of (user_id, reviews) pairs.

    Returns:
        A pandas Series indexed by profile size (number of reviews) in
        ascending order, whose values are the number of users of that size.
    """
    sizes = pd.Series([len(profile) for _, profile in users])
    size_counts = sizes.value_counts()
    return size_counts.sort_index()

# Function to calculate the local neighbor count for each user's first review in the split
def get_local_neighbor_count(users, local_product_counts):
    """Tabulate, per split, how many local neighbors each user's first review has.

    Args:
        users: Iterable of (user_id, reviews) pairs belonging to one split.
        local_product_counts: Mapping asin -> number of reviews of that
            product within the same split.

    Returns:
        A pandas Series indexed by neighbor count in ascending order, whose
        values are the number of users with that many neighbors. A user's
        neighbor count is the split's review count for the product of the
        user's first review, minus one for that review itself.
    """
    neighbors_per_user = [
        local_product_counts[profile[0]['asin']] - 1
        for _, profile in users
    ]
    return pd.Series(neighbors_per_user).value_counts().sort_index()

# Function to print the local neighbor count distribution with percentages
def print_local_neighbor_distribution_with_percentages(local_neighbor_count, split_name):
    """Print a local-neighbor-count distribution with per-bucket percentages.

    Args:
        local_neighbor_count: pandas Series mapping neighbor count -> number
            of users with that many neighbors.
        split_name: Label placed at the start of the header line.

    Prints a header, one line per entry of the Series in its own order, and a
    trailing blank block. Nothing is returned.
    """
    # The bucket sizes add up to the number of users in the split.
    total_users = local_neighbor_count.sum()

    print(f"{split_name} Local Neighbor Count Distribution (for the first review):")
    for n_neighbors, n_users in local_neighbor_count.items():
        share = (n_users / total_users) * 100
        print(f"Users with {n_neighbors} neighbors: {n_users} ({share:.2f}%)")
    print("\n")

# Function to print the profile size distribution with percentages
def print_profile_size_distribution_with_percentages(profile_size_distribution, split_name):
    """Print a profile-size distribution with per-bucket percentages.

    Args:
        profile_size_distribution: pandas Series mapping profile size ->
            number of users with that many reviews.
        split_name: Label placed at the start of the header line.

    Prints a header, one line per entry of the Series in its own order, and a
    trailing blank block. Nothing is returned.
    """
    # The bucket sizes add up to the number of users in the split.
    total_users = profile_size_distribution.sum()

    print(f"{split_name} Profile Size Distribution:")
    for size, n_users in profile_size_distribution.items():
        share = (n_users / total_users) * 100
        print(f"Users with profile size {size}: {n_users} ({share:.2f}%)")
    print("\n")

# Local neighbor counts (first review only) for the train, dev and test splits
train_local_neighbor_count, dev_local_neighbor_count, test_local_neighbor_count = [
    get_local_neighbor_count(split_users, split_counts)
    for split_users, split_counts in (
        (train_users, local_product_counts_train),
        (dev_users, local_product_counts_dev),
        (test_users, local_product_counts_test),
    )
]

# Profile size distributions for the same three splits
train_profile_size_distribution, dev_profile_size_distribution, test_profile_size_distribution = [
    get_profile_size_distribution(split_users)
    for split_users in (train_users, dev_users, test_users)
]

# Print all neighbor-count distributions first, in train/dev/test order
for split_label, neighbor_distribution in (
    ("Train", train_local_neighbor_count),
    ("Dev", dev_local_neighbor_count),
    ("Test", test_local_neighbor_count),
):
    print_local_neighbor_distribution_with_percentages(neighbor_distribution, split_label)

# Then print all profile-size distributions, in the same order
for split_label, size_distribution in (
    ("Train", train_profile_size_distribution),
    ("Dev", dev_profile_size_distribution),
    ("Test", test_profile_size_distribution),
):
    print_profile_size_distribution_with_percentages(size_distribution, split_label)


In [None]:
def print_product_review_distribution_with_percentages(local_product_counts, split_name):
    """Print how many products have each review count, with percentages.

    NOTE: this re-definition shadows the function of the same name defined in
    an earlier cell of the notebook.

    Args:
        local_product_counts: Mapping of product id (asin) to its review count.
        split_name: Label inserted into the header line.

    Prints a header, one line per distinct review count in ascending order,
    and a trailing blank block. Nothing is returned.
    """
    # Bucket products by their number of reviews.
    bucket_sizes = defaultdict(int)
    for reviews_for_product in local_product_counts.values():
        bucket_sizes[reviews_for_product] += 1

    # Each product sits in exactly one bucket, so this is the product total.
    total_products = sum(bucket_sizes.values())

    print(f"Review Count Distribution for {split_name} split:")
    for review_count in sorted(bucket_sizes):
        product_count = bucket_sizes[review_count]
        percentage = (product_count / total_products) * 100
        print(f"Products with {review_count} reviews: {product_count} ({percentage:.2f}%)")
    print("\n")
    
# Report the per-product review-count distribution of every split, in train/dev/test order.
for split_label, split_product_counts in (
    ("Train", local_product_counts_train),
    ("Dev", local_product_counts_dev),
    ("Test", local_product_counts_test),
):
    print_product_review_distribution_with_percentages(split_product_counts, split_label)

In [None]:
import json
import tqdm

# Convert the users of a split into question entries and their expected outputs
def process_user_data(users):
    """Build the exported question and output records for a list of users.

    Args:
        users: Iterable of (user_id, reviews) pairs; every review is a mapping
            with 'rating', 'title', 'text' and 'asin' keys, and each profile
            holds at least one review.

    Returns:
        Tuple ``(out, predictions)``:
        * ``out`` -- one ``{"id", "profile"}`` dict per user, where the profile
          lists the rating, title, text and productAsin of every review;
        * ``predictions`` -- one ``{"id", "output"}`` dict per user, where the
          output is the title of the user's first review.
    """
    def _profile_item(review):
        # Only these four fields are exported; timestamp, helpful_vote and
        # verified_purchase can be added here if they are needed.
        return {
            'rating': review['rating'],
            'title': review['title'],
            'text': review['text'],
            "productAsin": review['asin'],
        }

    out, predictions = [], []
    for user_id, reviews in tqdm.tqdm(users):
        # The expected output is taken from the first review of the profile.
        target_title = reviews[0]['title']
        out.append({
            "id": user_id,
            "profile": [_profile_item(review) for review in reviews],
        })
        predictions.append({"id": user_id, "output": target_title})
    return out, predictions

# Announce the conversion step in the log
logging.info("Processing train, test, and dev data")

# Convert every split into (question entries, expected outputs)
out_train, predictions_train = process_user_data(train_users)
out_test, predictions_test = process_user_data(test_users)
out_dev, predictions_dev = process_user_data(dev_users)

# Common prefix of every output file name (change it to write a different set of files)
file_name_base = "amazon_title_generation"

# Function to write data to files
def write_to_file(data, file_suffix):
    """Write ``data`` as indented JSON to "<file_name_base>_<file_suffix>.json".

    Args:
        data: JSON-serializable object to store.
        file_suffix: Text appended to the module-level ``file_name_base`` to
            form the file name.

    The file is opened in write mode, so an existing file of the same name is
    overwritten. Nothing is returned.
    """
    target_name = "{}_{}.json".format(file_name_base, file_suffix)
    with open(target_name, "w") as handle:
        json.dump(data, handle, indent=4)

# Write the question and output files of each split (train, test, then dev)
logging.info("Writing data to files")
for payload, suffix in (
    (out_train, "questions_train2"),
    (predictions_train, "outputs_train2"),
    (out_test, "questions_test2"),
    (predictions_test, "outputs_test2"),
    (out_dev, "questions_dev2"),
    (predictions_dev, "outputs_dev2"),
):
    write_to_file(payload, suffix)

# Confirm in the log that every file was written
logging.info("Data successfully written to files.")
