In [21]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path
import pickle

In [22]:
df = pd.read_csv(Path.cwd() / "yelpnyc" / "metadata.csv")
df

Unnamed: 0,Reviewer_id,Product_id,Rating,Label,Date
0,923,0,3,-1,12/8/2014
1,924,0,3,-1,5/16/2013
2,925,0,4,-1,7/1/2013
3,926,0,4,-1,7/28/2011
4,927,0,4,-1,11/1/2010
...,...,...,...,...,...
359047,161146,349,5,1,2/6/2014
359048,116424,349,5,1,1/31/2014
359049,161147,349,5,1,1/30/2014
359050,97930,349,5,1,1/25/2014


In [23]:
# Create a mapping of unique reviewers and products
unique_reviewers = np.sort(df['Reviewer_id'].unique())
unique_products = np.sort(df['Product_id'].unique())

num_reviews = len(df)
num_reviewers = len(unique_reviewers)
num_products = len(unique_products)

print(f"{num_reviews} reviews")
print(f"{num_reviewers} reviewers")
print(f"{num_products} products")

359052 reviews
160225 reviewers
923 products


In [24]:
# Create dictionaries to map reviewers and products to indices
reviewer_to_index = {reviewer: index for index, reviewer in enumerate(unique_reviewers)}
product_to_index = {product: index for index, product in enumerate(unique_products)}

# Convert the 'Reviewer_id' and 'Product_id' columns to NumPy arrays
reviewer_ids = df['Reviewer_id'].apply(lambda x: reviewer_to_index[x]).values
product_ids = df['Product_id'].apply(lambda x: product_to_index[x]).values

ones = np.ones(num_reviews)

adjacency_matrix = csr_matrix((ones, (reviewer_ids, product_ids)), shape=(num_reviewers, num_products))
adjacency_matrix.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:

# Create dictionaries to map reviewers and products to indices
reviewer_to_index = {reviewer: index for index, reviewer in enumerate(unique_reviewers)}
product_to_index = {product: index for index, product in enumerate(unique_products)}

# Convert the 'Reviewer_id' and 'Product_id' columns to NumPy arrays
reviewer_ids = df['Reviewer_id'].apply(lambda x: reviewer_to_index[x]).values
product_ids = df['Product_id'].apply(lambda x: product_to_index[x]).values
ratings = df['Rating'].values  # Get the ratings

# Create the CSR matrix with the ratings
ratings_matrix = csr_matrix((ratings, (reviewer_ids, product_ids)), shape=(num_reviewers, num_products))
ratings_matrix.toarray()

array([[3, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
d = df.groupby('Product_id')['Rating'].mean().to_dict()

print(d.values())

average_ratings = np.array(list(d.values()))

dict_values([4.0095238095238095, 4.32806324110672, 3.8433734939759034, 4.169491525423729, 4.03290676416819, 4.372093023255814, 4.200787401574803, 3.7518463810930576, 3.876543209876543, 3.842696629213483, 4.473895582329317, 4.0, 3.727810650887574, 4.022471910112359, 3.908256880733945, 3.9390243902439024, 3.9730941704035874, 4.13768115942029, 3.9198717948717947, 4.037151702786378, 4.322033898305085, 4.366666666666666, 4.23404255319149, 4.195945945945946, 4.0, 4.076923076923077, 4.436936936936937, 3.8192090395480225, 3.5785123966942147, 3.9537037037037037, 4.4375, 4.435483870967742, 3.9008620689655173, 4.176404494382022, 4.109090909090909, 4.202898550724638, 3.9936102236421727, 4.023809523809524, 3.806970509383378, 3.7075892857142856, 4.312868949232586, 4.146435452793834, 3.7058823529411766, 3.8003613369467026, 3.931854199683043, 3.7948717948717947, 4.049689440993789, 4.2368421052631575, 3.985, 4.735294117647059, 4.351713859910581, 3.8004201680672267, 3.9679098005203817, 4.313333333333333

In [27]:
# Group the DataFrame by 'user_id' and check if there's any -1 label for each user
# A user is classified as fraudulent (label 1) if any reviews are fradulent 
# Otherwise, a user is classified as non-fraudulent (label 0)
grouped = df.groupby('Reviewer_id')['Label'].apply(lambda x: 1 if -1 in x.values else 0)

# Reset the index to make the user_id a column again
labels = grouped.reset_index()['Label'].to_numpy()
proportion_fraudulent = labels.mean()
print(f"Proportion of fraudulent users: {proportion_fraudulent}")

labels

Proportion of fraudulent users: 0.1778498985801217


array([1, 1, 1, ..., 0, 0, 0])

In [28]:
with open("yelpnyc_graph_u2p.pkl", "wb") as file:
    pickle.dump(adjacency_matrix, file)

with open("yelpnyc_ratings.pkl", "wb") as file:
    pickle.dump(ratings_matrix, file)

with open("yelpnyc_avg_ratings.pkl", "wb") as file:
    pickle.dump(average_ratings, file)

with open("yelpnyc_labels.pkl", "wb") as file:
    pickle.dump(labels, file)