In [79]:
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path
import pickle
import json

In [80]:
# --- Dataset configuration -------------------------------------------------
DATASET_NAME = "synthetic_easy_1000"
DATASET_COMMENT = "Easy synthetic dataset with one group of 1000 fraudsters"
n_users = 10000   # Number of users
n_items = 100     # Number of items
n_ratings = 30000 # Target number of non-fraudulent ratings
fraud_group_sizes = [(1000, 10)]  # One (n_users_in_group, n_items_targeted) tuple per fraud group
additive_noise = 0     # Number of edges to add to each fraudster
subtractive_noise = 0  # Number of edges to remove from each fraudster

# The dataset is generated with numpy's global RNG; seed it so that
# "Restart Kernel -> Run All" reproduces exactly the same dataset.
# Re-run this cell before regenerating if you need the identical sample again.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

In [81]:
def build_synthetic_dataset(n_users: int, n_items: int, n_reviews: int, fraud_group_sizes: list) -> tuple:
    """ Creates the graph_u2i and labels for a synthetic dataset.
        In the synthetic dataset generated, every fraudster will review every target item for their group.
        However, they may also have reviewed other items as well.
        This function can be used in combination with add_noise_to_fraudsters to add additional noise to the dataset.

    Args:
        n_users: Number of users in the dataset.
        n_items: Number of items in the dataset.
        n_reviews: Target number of non-fraudulent reviews in the dataset. This is a budget, not an
            exact count: every user gets at least one review, so the total can exceed it
            (it always does when n_reviews < n_users).
        fraud_group_sizes: List of tuples. Each tuple is in format (n_users_in_group, n_items_targeted).
            Groups are sampled independently, so a user may end up in more than one group.

    Returns:
        graph_u2i: Sparse CSR matrix of shape (n_users, n_items) with 1s in the positions where a user has reviewed an item.
        labels: array of labels for each user in the dataset. 1 if the user is fraudulent, 0 otherwise.

    Raises:
        ValueError: raised by np.random.choice if a fraud group asks for more users than n_users
            or more items than n_items.
    """
    # Assemble in LIL format: changing the sparsity structure of a CSR matrix
    # row by row is the slow path (SparseEfficiencyWarning). Convert to CSR once at the end.
    graph_u2i = csr_matrix((n_users, n_items), dtype=np.int8).tolil()
    labels = np.zeros(n_users, dtype=np.int8)

    # Generate non-fraudulent reviews
    remaining_reviews = n_reviews
    for user in range(n_users):
        remaining_users = n_users - user

        # The minimum of one review per user can overspend the budget, which makes
        # remaining_reviews negative. Clamp at 0 so the normal distribution never
        # receives a negative scale (np.random.normal raises ValueError for scale < 0).
        avg_reviews_per_user = max(remaining_reviews, 0) / remaining_users
        num_to_review = int(np.random.normal(loc=avg_reviews_per_user, scale=avg_reviews_per_user / 2))
        num_to_review = min(num_to_review, remaining_reviews, n_items)
        num_to_review = max(num_to_review, 1)  # every user reviews at least one item

        # Mark the distinct items this user reviewed
        items = np.random.choice(n_items, num_to_review, replace=False)
        graph_u2i[user, items] = 1

        remaining_reviews -= num_to_review

    # Generate fraudulent reviews
    # For each fraudulent group, randomly choose that many users to be fraudulent
    # and some number of items to target.
    # All users in the fraudulent group will review all items in the group.
    for group_size, num_items_targeted in fraud_group_sizes:
        users = np.random.choice(n_users, group_size, replace=False)
        items = np.random.choice(n_items, num_items_targeted, replace=False)
        # users[:, None] broadcasts against items: the full (users x items) block is set
        graph_u2i[users[:, None], items] = 1
        labels[users] = 1

    return graph_u2i.tocsr(), labels

In [82]:
def add_noise_to_fraudsters(graph_u2i, labels, additive_noise, subtractive_noise):
    """ For each fraudster, this randomly adds and removes edges from the graph_u2i matrix.
        The matrix is modified in place; nothing is returned.

        Edges to remove are drawn only from the edges the fraudster had before this call,
        so an edge added as additive noise is never removed again in the same call.
        If a fraudster has fewer candidate items than requested (e.g. they already reviewed
        almost every item), as many edges as possible are added/removed instead of failing.

    Args:
        graph_u2i: Sparse matrix of shape (n_users, n_items) with 1s in the positions where a user has reviewed an item.
        labels: array of labels for each user in the dataset. 1 if the user is fraudulent, 0 otherwise.
        additive_noise: number of edges to add to each fraudster
        subtractive_noise: number of edges to remove from each fraudster
    """
    fraudsters = np.where(labels == 1)[0]
    for fraudster in fraudsters:
        # Snapshot of the row before any noise is applied
        dense_row = graph_u2i[fraudster].toarray()[0]
        unreviewed = np.where(dense_row == 0)[0]
        reviewed = np.where(dense_row == 1)[0]

        # Add edges (clamped: sampling without replacement cannot exceed the pool size)
        n_add = min(additive_noise, unreviewed.size)
        if n_add > 0:
            items_to_add = np.random.choice(unreviewed, n_add, replace=False)
            graph_u2i[fraudster, items_to_add] = 1

        # Remove edges
        n_remove = min(subtractive_noise, reviewed.size)
        if n_remove > 0:
            items_to_remove = np.random.choice(reviewed, n_remove, replace=False)
            graph_u2i[fraudster, items_to_remove] = 0

    # Writing 0 into a compressed sparse matrix keeps the entry as an explicitly
    # stored zero, so nnz / indices / data would still report the removed edges.
    # Drop those entries so the stored structure matches the actual graph.
    if hasattr(graph_u2i, "eliminate_zeros"):
        graph_u2i.eliminate_zeros()


In [83]:
# Build the clean user-item graph, then perturb the fraudsters' rows in place.
graph_u2i, labels = build_synthetic_dataset(
    n_users=n_users,
    n_items=n_items,
    n_reviews=n_ratings,
    fraud_group_sizes=fraud_group_sizes,
)
add_noise_to_fraudsters(
    graph_u2i=graph_u2i,
    labels=labels,
    additive_noise=additive_noise,
    subtractive_noise=subtractive_noise,
)