In [None]:
import pandas as pd
import random
from datasets import load_dataset
from collections import defaultdict
import tqdm
from datetime import datetime, timedelta
import logging
import json

RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [None]:
# Per-dataset settings. Every entry provides:
#   columns        - maps the generic field names used by this notebook to the dataset's own column names
#   split_sizes    - number of users wanted in each of train / dev / test
#   data_source(s) - where the raw data comes from
#   file_name_base - prefix of the JSON files written for the dataset
dataset_configs = dict(
    amazon=dict(
        columns=dict(
            user_id='user_id',
            title='title',
            text='text',
            rating='rating',
            product_id='asin',
        ),
        split_sizes=dict(train=20000, dev=2500, test=2500),
        # Amazon is loaded from a dataset hub rather than a local file; the call is kept here as a plain string
        data_source='load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty")',
        file_name_base='amazon',
    ),
    b2w=dict(
        columns=dict(
            user_id='reviewer_id',
            title='review_title',
            text='review_text',
            rating='overall_rating',
            product_id='product_id',
        ),
        split_sizes=dict(train=20000, dev=2500, test=2500),
        # Placeholder path of the B2W CSV file
        data_source='/path_to/b2w_dataset.csv',
        file_name_base='b2w',
    ),
    gap=dict(
        columns=dict(
            user_id='reviews.username',  # the reviewer's username serves as the user id for GAP
            title='reviews.title',
            text='reviews.text',
            rating='reviews.rating',
            product_id='id',
        ),
        split_sizes=dict(train=20000, dev=2500, test=2500),
        # Placeholder path of the GAP CSV file
        data_source='/path_to/gap_dataset.csv',
        file_name_base='gap',
    ),
    hotel=dict(
        columns=dict(
            user_id='reviews.username',  # the reviewer's username serves as the user id for Hotel
            title='reviews.title',
            text='reviews.text',
            rating='reviews.rating',
            product_id='id',
        ),
        split_sizes=dict(train=9000, dev=2500, test=2500),
        # The hotel data comes as two CSV files, hence the plural key 'data_sources'
        data_sources=[
            '/path_to/Datafiniti_Hotel_Reviews.csv',
            '/path_to/Datafiniti_Hotel_Reviews_Jun19.csv',
        ],
        file_name_base='hotel',
    ),
)

In [None]:
def load_and_clean_b2w(file_path):
    """
    Read the B2W reviews CSV and keep only the fully populated rows.

    Args:
    - file_path: location of the CSV file, passed straight to pandas.read_csv

    Returns:
    - DataFrame holding the rows that have no NaN in any column
      (the original row index is kept, not renumbered)
    """
    # how='any' is dropna's default: a single missing cell is enough to discard the row
    return pd.read_csv(file_path).dropna(axis=0, how='any')

In [None]:
def load_and_clean_gap(file_path):
    """
    Read the GAP reviews CSV and discard rows that lack a critical field.

    Args:
    - file_path: location of the CSV file, passed straight to pandas.read_csv

    Returns:
    - DataFrame without the rows whose 'id', 'reviews.title', 'reviews.text'
      or 'reviews.username' is missing; other columns may still contain NaN
    """
    required_columns = ['id', 'reviews.title', 'reviews.text', 'reviews.username']

    # Turn empty strings into missing values so that dropna treats them as gaps
    reviews = pd.read_csv(file_path).replace('', pd.NA)

    return reviews.dropna(subset=required_columns)


In [None]:
# Function to load, preprocess, filter, and organize data by user_id
def load_and_preprocess_amazon(dataset_name, split_name, cutoff_date, dataset_config, trust_remote_code=True):
    """
    Load the Amazon reviews, keep the ones newer than a cutoff and group them per user.

    Args:
    - dataset_name: str, first argument given to load_dataset (e.g., "McAuley-Lab/Amazon-Reviews-2023")
    - split_name: str, second argument given to load_dataset (e.g., "raw_review_All_Beauty")
    - cutoff_date: datetime, only reviews strictly newer than this are kept
    - dataset_config: dict, configuration whose ['columns']['user_id'] names the user id field
    - trust_remote_code: bool, forwarded unchanged to load_dataset

    Returns:
    - list of (user_id, reviews) tuples, users in order of first appearance and each
      user's reviews in dataset order
    """
    logging.info(f"Loading {dataset_name} dataset...")
    # Only the "full" part of the loaded dataset is used
    full_split = load_dataset(dataset_name, split_name, trust_remote_code=trust_remote_code)["full"]

    user_key = dataset_config['columns']['user_id']

    logging.info("Filtering dataset for time cutoff and organizing data by user_id")

    reviews_by_user = defaultdict(list)
    for record in tqdm.tqdm(full_split):
        # 'timestamp' is divided by 1000 before conversion, i.e. it is read as epoch milliseconds;
        # fromtimestamp without a tz argument yields a naive datetime in local time
        reviewed_at = datetime.fromtimestamp(record['timestamp'] / 1000)
        if reviewed_at <= cutoff_date:
            continue  # at or before the cutoff: not kept
        reviews_by_user[record[user_key]].append(record)

    return list(reviews_by_user.items())

# Cutoff for the Amazon dataset: three 365-day years before the fixed reference date of Jan 1, 2024
FIXED_CUTOFF_DATE = datetime(year=2024, month=1, day=1)
DATE_CUTOFF = FIXED_CUTOFF_DATE - 3 * timedelta(days=365)

In [None]:
def load_merge_and_clean_hotel(files):
    """
    Read several hotel review CSV files, stack them and discard incomplete rows.

    Args:
    - files: iterable of CSV locations, each passed to pandas.read_csv

    Returns:
    - DataFrame with the rows of all files stacked under a fresh 0..n-1 index, minus
      the rows whose 'id', 'reviews.title', 'reviews.text' or 'reviews.username' is missing
    """
    required_columns = ['id', 'reviews.title', 'reviews.text', 'reviews.username']

    # One DataFrame per input file, in the order given
    frames = list(map(pd.read_csv, files))

    # Stack vertically, then turn empty strings into missing values so dropna treats them as gaps
    stacked = pd.concat(frames, ignore_index=True).replace('', pd.NA)

    return stacked.dropna(subset=required_columns)


In [None]:
# Organize data by user_id dynamically based on the dataset configuration
def organize_data_by_user_id(filtered_ds, columns):
    """
    Group review records by user.

    Args:
        filtered_ds: An iterable of review records (mappings indexable by column name),
            or a pandas DataFrame with one review per row.
        columns: A dictionary mapping column names to the actual column names in the
            dataset; only columns['user_id'] is used here.

    Returns:
        users_list: A list of (user_id, reviews) tuples, users in order of first
            appearance and each user's reviews in input order.
    """
    logging.info("Organizing data by user_id")

    # Use the dynamic 'user_id' column from the configuration
    user_id_column = columns['user_id']

    # Iterating a DataFrame yields its column labels, not its rows, so the loop below
    # would fail on the frames returned by the CSV loaders; convert to one dict per row.
    if isinstance(filtered_ds, pd.DataFrame):
        filtered_ds = filtered_ds.to_dict(orient='records')

    reviews_by_user = defaultdict(list)
    for record in filtered_ds:
        reviews_by_user[record[user_id_column]].append(record)

    # Convert dictionary to list of users
    return list(reviews_by_user.items())

# Function to build global product counts across the entire dataset
def build_global_product_counts(users, columns):
    """
    Map each product to the users whose first review is about it.

    Args:
        users: A list of users, each a tuple (user_id, reviews) with at least one review.
        columns: A dictionary mapping column names to the actual column names in the
            dataset; only columns['product_id'] is used here.

    Returns:
        A defaultdict(list) from product id to the list of user ids, in input order.
        Only the first review of each profile is looked at; later reviews are ignored.
    """
    users_by_first_product = defaultdict(list)
    for user_id, reviews in users:
        first_product = reviews[0][columns['product_id']]
        users_by_first_product[first_product].append(user_id)
    return users_by_first_product

# Function to pick a random review that has global neighbors, or exclude the user if none do
def set_random_first_review_with_global_neighbor(user_reviews, global_product_counts, columns):
    """
    Move a randomly chosen review that has global neighbors to the front of a profile.

    A review has global neighbors when its product is listed with more than one
    user in global_product_counts.

    Args:
        user_reviews (list): The user's reviews; reordered in place.
        global_product_counts (dict): A dictionary mapping product IDs to lists of user IDs.
        columns (dict): Column configuration; columns['product_id'] names the product field.

    Returns:
        The same user_reviews list with the chosen review first, or None when no
        review has global neighbors (the user should then be excluded).
    """
    # The configuration key is 'product_id' (its value is e.g. 'asin' for Amazon);
    # looking up columns['asin'] raised KeyError for every dataset config.
    product_key = columns['product_id']

    # .get keeps a defaultdict from growing an empty entry for every unseen product
    valid_reviews = [
        review for review in user_reviews
        if len(global_product_counts.get(review[product_key], ())) > 1
    ]

    if not valid_reviews:
        return None  # No reviews with global neighbors, exclude the user

    random_review = random.choice(valid_reviews)  # Pick a random review with global neighbors
    user_reviews.remove(random_review)
    user_reviews.insert(0, random_review)
    return user_reviews

# Helper function to add a user to a specified split
def add_user_to_split(user_id, user_reviews, split_users, local_product_counts, added_users, columns):
    """
    Add a user to a split and count the products of all their reviews for that split.

    Args:
        user_id: Identifier of the user being added.
        user_reviews (list): The user's reviews.
        split_users (list): The split to extend; receives the tuple (user_id, user_reviews).
        local_product_counts (dict): Per-split mapping from product ID to review count; updated in place.
        added_users (set): IDs of users already assigned to a split; updated in place.
        columns (dict): Column configuration; columns['product_id'] names the product field.

    Returns:
        None. All three containers are modified in place.
    """
    # The configuration key is 'product_id' (its value is e.g. 'asin' for Amazon);
    # looking up columns['asin'] raised KeyError for every dataset config.
    product_key = columns['product_id']

    # Add the user to the split
    split_users.append((user_id, user_reviews))
    added_users.add(user_id)

    # Update local product counts for the split; .get lets a plain dict work as well as a defaultdict
    for review in user_reviews:
        product = review[product_key]
        local_product_counts[product] = local_product_counts.get(product, 0) + 1

def distribute_global_neighbors(users, num_users_train, num_users_dev, num_users_test, global_product_counts, columns=None):
    """
    Distribute users into train, dev, and test splits based on global neighbors.

    Products listed with more than one user are visited in a shuffled order; the
    users of each product (also shuffled) fill train first, then dev, then test.
    Each user is assigned at most once. Users that only appear under single-user
    products are never assigned.

    The module-level random generator is reseeded with 42 on every call, so the
    result is deterministic for given inputs (and the global random state is reset
    as a side effect).

    Args:
        users (list): List of users, each represented as a tuple (user_id, reviews).
        num_users_train (int): The number of users for the training set.
        num_users_dev (int): The number of users for the development set.
        num_users_test (int): The number of users for the test set.
        global_product_counts (dict): A dictionary mapping product IDs to lists of user IDs who reviewed them.
            It is read only; the lists are not reordered.
        columns (dict, optional): Column configuration. When given, columns['product_id']
            is used to count, per split, the products of every review of every user
            added to that split. When omitted, the local product counts stay empty.

    Returns:
        train_users (list), dev_users (list), test_users (list), local_product_counts_train (dict),
        local_product_counts_dev (dict), local_product_counts_test (dict): The split datasets and product counts.

    Raises:
        KeyError: If a user ID in global_product_counts has no entry in users.
    """
    train_users = []
    dev_users = []
    test_users = []

    # Initialize local product counts for train, dev, and test splits
    local_product_counts_train = defaultdict(int)
    local_product_counts_dev = defaultdict(int)
    local_product_counts_test = defaultdict(int)

    # (target list, capacity, product counter) in fill order
    splits = (
        (train_users, num_users_train, local_product_counts_train),
        (dev_users, num_users_dev, local_product_counts_dev),
        (test_users, num_users_test, local_product_counts_test),
    )
    result = (
        train_users, dev_users, test_users,
        local_product_counts_train, local_product_counts_dev, local_product_counts_test,
    )

    # Index the reviews once instead of scanning the whole users list for every lookup;
    # setdefault keeps the first entry should a user ID occur more than once.
    reviews_by_user = {}
    for entry in users:
        reviews_by_user.setdefault(entry[0], entry[1])

    added_users = set()

    # Exclude products with only one review
    filtered_product_list = [asin for asin, user_ids in global_product_counts.items() if len(user_ids) > 1]

    random.seed(42)
    random.shuffle(filtered_product_list)

    for asin in filtered_product_list:
        # Shuffle a copy so the caller's mapping is left untouched
        user_ids = list(global_product_counts[asin])
        random.shuffle(user_ids)

        for user_id in user_ids:
            if user_id in added_users:
                continue

            user_reviews = reviews_by_user[user_id]

            # Put the user into the first split that still has room
            for split_users, capacity, local_counts in splits:
                if len(split_users) < capacity:
                    split_users.append((user_id, user_reviews))
                    added_users.add(user_id)
                    if columns is not None:
                        for review in user_reviews:
                            local_counts[review[columns['product_id']]] += 1
                    break

            # Stop as soon as every split is full; going on would only shuffle and
            # scan the remaining products without being able to add anyone.
            if all(len(split_users) >= capacity for split_users, capacity, _ in splits):
                return result

    return result

In [None]:
# General function to process user data (without predictions)
def process_user_data(users, columns):
    """
    Convert grouped user reviews into the uniform output records of this notebook.

    Args:
        users: A list of users where each user is represented as a tuple (user_id, reviews).
        columns: A dictionary mapping the generic names 'title', 'text', 'rating' and
            'product_id' to the actual column names in the dataset.

    Returns:
        out: A list with one dict per user: {"id": user_id, "profile": [...]}, where each
            profile item has the keys 'review_title', 'review_text', 'overall_rating'
            and 'product_id'.
    """
    def to_profile_item(review):
        # Translate one dataset-specific review into the dataset-independent field names
        return {
            'review_title': review[columns['title']],
            'review_text': review[columns['text']],
            'overall_rating': review[columns['rating']],
            'product_id': review[columns['product_id']],
        }

    out = []
    for user_id, reviews in tqdm.tqdm(users):
        profile = [to_profile_item(review) for review in reviews]
        out.append({"id": user_id, "profile": profile})

    return out

# Function to write data to files 
def write_to_file(data, file_suffix, file_name_base="dataset"):
    """
    Dump data as 4-space indented JSON into "<file_name_base>_<file_suffix>.json".

    Args:
        data: A JSON-serializable object.
        file_suffix: Last part of the file name (e.g., "train").
        file_name_base: First part of the file name; may include a directory.
    """
    target_path = "{}_{}.json".format(file_name_base, file_suffix)
    # Mode "w" creates the file or overwrites an existing one
    with open(target_path, "w") as handle:
        json.dump(data, handle, indent=4)

# Function to process and save dataset dynamically
def process_and_save_dataset(dataset_name, train_users, dev_users, test_users, dataset_configs):
    """
    Convert the three user splits of one dataset and write each to its own JSON file.

    Args:
        dataset_name: Key of the dataset in dataset_configs (e.g., 'amazon', 'b2w', 'gap', 'hotel').
        train_users, dev_users, test_users: Lists of (user_id, reviews) tuples per split.
        dataset_configs: Dictionary with configurations for all datasets; the selected
            entry must provide 'columns' and 'file_name_base'.
    """
    logging.info(f"Processing {dataset_name} data")

    config = dataset_configs[dataset_name]

    # Convert the splits (train, then test, then dev)
    processed = {}
    for split, split_users in (("train", train_users), ("test", test_users), ("dev", dev_users)):
        processed[split] = process_user_data(split_users, config['columns'])

    # Write one file per split (train, then dev, then test)
    logging.info(f"Writing {dataset_name} data to files")
    for split in ("train", "dev", "test"):
        write_to_file(processed[split], split, config['file_name_base'])

    logging.info(f"{dataset_name.capitalize()} data successfully written to files.")

In [None]:
# Driver cell: builds the Amazon train/dev/test user splits and writes them to JSON files.
# NOTE(review): only dataset_name is switchable here; the loading step below is
# hard-wired to the Amazon dataset, so other values need a different loader.
dataset_name = 'amazon'  # Key into dataset_configs (other keys: 'b2w', 'gap', 'hotel')
dataset_config = dataset_configs[dataset_name]

# Despite its name, d_dic_amazon is a list of (user_id, reviews) tuples holding only
# the reviews newer than DATE_CUTOFF.
d_dic_amazon = load_and_preprocess_amazon(
    dataset_name="McAuley-Lab/Amazon-Reviews-2023", 
    split_name="raw_review_All_Beauty", 
    cutoff_date=DATE_CUTOFF,
    dataset_config = dataset_configs['amazon']
)

# Product -> users mapping, based on the first review of each user's profile
global_product_counts = build_global_product_counts(d_dic_amazon, dataset_config['columns'])

# Assign users to the splits with the sizes from the config; the three local product
# count dicts come back empty because no column mapping is passed for them.
train_users, dev_users, test_users, local_product_counts_train, local_product_counts_dev, local_product_counts_test = distribute_global_neighbors(
    d_dic_amazon, dataset_config['split_sizes']['train'], dataset_config['split_sizes']['dev'], dataset_config['split_sizes']['test'], global_product_counts
)

# Writes <file_name_base>_train.json, _dev.json and _test.json into the working directory
process_and_save_dataset(dataset_name, train_users, dev_users, test_users, dataset_configs)

# users_list = organize_data_by_user_id(filtered_ds, dataset_config['columns'])

# # Process B2W dataset
# processed_b2w_data, b2w_predictions = process_user_data(users_list, dataset_configs['b2w']['columns'])

# # Process GAP dataset
# processed_gap_data, gap_predictions = process_user_data(users_list, dataset_configs['gap']['columns'])

# # Process Hotel dataset
# processed_hotel_data, hotel_predictions = process_user_data(users_list, dataset_configs['hotel']['columns'])
