In [7]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path
import pickle

In [8]:
# Read the review metadata table from yelpnyc/metadata.csv, resolved against
# the current working directory, and display it.
metadata_path = Path.cwd().joinpath("yelpnyc", "metadata.csv")
df = pd.read_csv(metadata_path)
df

Unnamed: 0,Reviewer_id,Product_id,Rating,Label,Date
0,923,0,3,-1,12/8/2014
1,924,0,3,-1,5/16/2013
2,925,0,4,-1,7/1/2013
3,926,0,4,-1,7/28/2011
4,927,0,4,-1,11/1/2010
...,...,...,...,...,...
359047,161146,349,5,1,2/6/2014
359048,116424,349,5,1,1/31/2014
359049,161147,349,5,1,1/30/2014
359050,97930,349,5,1,1/25/2014


In [9]:
# Collect the distinct reviewer and product ids (pandas returns them in order
# of first appearance) and report the basic size statistics of the dataset.
unique_reviewers, unique_products = (
    df[column].unique() for column in ('Reviewer_id', 'Product_id')
)

num_reviews = df.shape[0]
num_reviewers = unique_reviewers.size
num_products = unique_products.size

print(
    f"{num_reviews} reviews",
    f"{num_reviewers} reviewers",
    f"{num_products} products",
    sep="\n",
)

359052 reviews
160225 reviewers
923 products


In [13]:
# Build the sparse reviewer x product matrix: one row per reviewer, one column
# per product, with a 1.0 contributed by every review.

# Map each raw id to a dense 0-based index. Indices follow the order of
# `unique_reviewers` / `unique_products`, i.e. order of first appearance in df.
reviewer_to_index = {reviewer: index for index, reviewer in enumerate(unique_reviewers)}
product_to_index = {product: index for index, product in enumerate(unique_products)}

# Translate the id columns into row / column indices. Series.map with a dict
# is a vectorised lookup, unlike a per-element Python lambda.
reviewer_ids = df['Reviewer_id'].map(reviewer_to_index).to_numpy()
product_ids = df['Product_id'].map(product_to_index).to_numpy()

ones = np.ones(num_reviews)

# NOTE(review): the (data, (row, col)) constructor sums duplicate coordinates,
# so a reviewer with several reviews of the same product gets an entry > 1.
# Confirm whether downstream code expects review counts or a binary adjacency.
adjacency_matrix = csr_matrix((ones, (reviewer_ids, product_ids)), shape=(num_reviewers, num_products))

# Show the shape only: .toarray() would allocate the full dense
# num_reviewers x num_products float64 array (over 1 GB here) just to display it.
adjacency_matrix.shape

(160225, 923)


In [11]:
# Derive one label per reviewer from the per-review 'Label' column:
# a reviewer is fraudulent (1) if any of their reviews has Label == -1,
# otherwise non-fraudulent (0).
#
# sort=False keeps the groups in order of first appearance in df, which is the
# same order df['Reviewer_id'].unique() returns. The default (sort=True) would
# order labels by ascending Reviewer_id instead, and labels[i] would then only
# describe the i-th reviewer in first-appearance order if ids happened to
# appear in ascending order.
grouped = (
    df['Label'].eq(-1)
    .groupby(df['Reviewer_id'], sort=False)
    .any()
    .astype(np.int64)
)

# One entry per distinct reviewer.
labels = grouped.to_numpy()
assert labels.shape[0] == df['Reviewer_id'].nunique()
labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [12]:
# Persist the sparse reviewer-product matrix and the per-reviewer label array
# as pickle files in the current working directory.
for filename, payload in (
    ("yelpnyc_graph_u2p.pkl", adjacency_matrix),
    ("yelpnyc_labels.pkl", labels),
):
    with open(filename, "wb") as file:
        pickle.dump(payload, file)