In [1]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn.linear_model import LogisticRegression, LinearRegression
import sklearn
import numpy as np
import pandas as pd
import random
import gzip
import math

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the gzipped JSONL file
df = pd.read_json("Amazon_Fashion.jsonl.gz", lines=True, compression='gzip')


In [4]:
duplicates_before = df[df.duplicated(subset=['user_id', 'asin'], keep=False)]

# Print duplicate rows
print(len(duplicates_before))

46771


In [5]:
# we drop all duplicates and only keep the newest review

df_unique = df.sort_values(by=['user_id', 'asin', 'timestamp'], ascending=[True, True, False])
df_unique = df.drop_duplicates(subset=['user_id', 'asin'], keep='first')


duplicates_after = df_unique[df_unique.duplicated(subset=['user_id', 'asin'], keep=False)]
print(len(duplicates_after))


0


In [4]:
df.head()


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5,Pretty locket,I think this locket is really pretty. The insi...,[],B00LOPVX74,B00LOPVX74,AGBFYI2DDIKXC5Y4FARTYDTQBMFQ,2020-01-09 00:06:34.489,3,True
1,5,A,Great,[],B07B4JXK8D,B07B4JXK8D,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-20 01:04:06.701,0,True
2,2,Two Stars,One of the stones fell out within the first 2 ...,[],B007ZSEQ4Q,B007ZSEQ4Q,AHITBJSS7KYUBVZPX7M2WJCOIVKQ,2015-05-23 01:33:48.000,3,True
3,1,Won’t buy again,Crappy socks. Money wasted. Bought to wear wit...,[],B07F2BTFS9,B07F2BTFS9,AFVNEEPDEIH5SPUN5BWC6NKL3WNQ,2018-12-31 20:57:27.095,2,True
4,5,I LOVE these glasses,I LOVE these glasses! They fit perfectly over...,[],B00PKRFU4O,B00XESJTDE,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2015-08-13 14:29:26.000,0,True


In [5]:
# Optional: Inspect the DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500939 entries, 0 to 2500938
Data columns (total 10 columns):
 #   Column             Dtype         
---  ------             -----         
 0   rating             int64         
 1   title              object        
 2   text               object        
 3   images             object        
 4   asin               object        
 5   parent_asin        object        
 6   user_id            object        
 7   timestamp          datetime64[ns]
 8   helpful_vote       int64         
 9   verified_purchase  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(6)
memory usage: 174.1+ MB
None


In [6]:
# number of reviews
length = len(df)
print(length)


2500939


In [7]:
missing_values = df.isna().sum()
print(missing_values)

rating               0
title                0
text                 0
images               0
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
dtype: int64


In [8]:
# number of users

number_of_users = len(df["user_id"].drop_duplicates())
print(number_of_users)


2035490


In [14]:
# number of items

number_of_products = len(df["asin"].drop_duplicates())
print(number_of_products)

874297


In [9]:
# mean/median number of reviews per user

number_of_reviews_per_user = df.groupby("user_id").size().reset_index(name='review_count')

mean_number_of_reviews_per_user = number_of_reviews_per_user["review_count"].mean()
median_number_of_reviews_per_user = number_of_reviews_per_user["review_count"].median()
std_number_of_reviews_per_user = number_of_reviews_per_user["review_count"].std()


print(mean_number_of_reviews_per_user)
print(median_number_of_reviews_per_user)
print(std_number_of_reviews_per_user)

1.2286668075008966
1.0
0.9318106540913449


In [13]:
# how often each rating score has been given

number_of_reviews_per_rating = df.groupby("rating").size().reset_index(name='review_count')
number_of_reviews_per_rating["review_count_percentage"] = number_of_reviews_per_rating["review_count"].apply(lambda x: (x / length) * 100)
number_of_reviews_per_rating

Unnamed: 0,rating,review_count,review_count_percentage
0,1,303411,12.131883
1,2,173462,6.935875
2,3,245471,9.815153
3,4,352327,14.087789
4,5,1426268,57.0293


In [11]:
# build clusters for super users: e.g., how many users give 0-5 ratings? How many 6-15? How many >15?

number_of_users_with_n_reviews = number_of_reviews_per_user.groupby("review_count").size().reset_index(name='user_count')
number_of_users_with_n_reviews["user_count_relative"] = number_of_users_with_n_reviews["user_count"].apply(lambda x: (x / number_of_users) * 100)

first_10 = number_of_users_with_n_reviews.iloc[:10]
rest = number_of_users_with_n_reviews.iloc[10:]

aggregated_rest = number_of_users_with_n_reviews.iloc[10:].sum().to_frame().T
aggregated_rest.loc[0, "review_count"] = "> 10"
result_df = pd.concat([first_10, aggregated_rest], ignore_index=True)

result_df

Unnamed: 0,review_count,user_count,user_count_relative
0,1,1736706.0,85.321274
1,2,218103.0,10.715012
2,3,48081.0,2.362134
3,4,17080.0,0.83911
4,5,6508.0,0.319726
5,6,3441.0,0.16905
6,7,1695.0,0.083272
7,8,1095.0,0.053795
8,9,681.0,0.033456
9,10,477.0,0.023434


In [24]:
# how many products have 0-5 ratings? How many 6-15? How many >15?

number_of_reviews_per_product = df.groupby("asin").size().reset_index(name='review_count')
number_of_products_with_n_reviews = number_of_reviews_per_product.groupby("review_count").size().reset_index(name='product_count')
number_of_products_with_n_reviews["product_count_relative"] = number_of_products_with_n_reviews["product_count"].apply(lambda x: (x / number_of_products) * 100)

first_10_products = number_of_products_with_n_reviews.iloc[:10]
rest_products = number_of_products_with_n_reviews.iloc[10:]

aggregated_rest_products = number_of_products_with_n_reviews.iloc[10:].sum().to_frame().T
aggregated_rest_products.loc[0, "review_count"] = "> 10"
result_df_products = pd.concat([first_10_products, aggregated_rest_products], ignore_index=True)

result_df_products

Unnamed: 0,review_count,product_count,product_count_relative
0,1,518437.0,59.297584
1,2,151670.0,17.347652
2,3,67152.0,7.680685
3,4,36427.0,4.166433
4,5,22806.0,2.608496
5,6,15027.0,1.718752
6,7,10990.0,1.25701
7,8,8104.0,0.926916
8,9,6176.0,0.706396
9,10,4857.0,0.555532


In [12]:
# review length - mean/median/std

review_texts = df["text"].reset_index(name='text')
review_texts["text_length"] = review_texts["text"].apply(lambda x: len(x))
mean_review_length = review_texts["text_length"].mean()
median_review_length = review_texts["text_length"].median()
std_review_length = review_texts["text_length"].std()


print(mean_review_length)
print(median_review_length)
print(std_review_length)

142.17913911534828
87.0
188.1751081216233


In [13]:
# number of reviews w/o text
number_of_reviews_without_text = len(df[df["text"].str.strip() == ""])
number_of_reviews_without_text

2426

In [14]:
# rating - mean/median/std
ratings = df["rating"]
ratings_mean = ratings.mean()
ratings_median = ratings.median()
ratings_std = ratings.std()

print(ratings_mean)
print(ratings_median)
print(ratings_std)

3.969467468019012
5.0
1.4271711881871694


In [15]:
# percentage of verified purchases

percentage_of_verified_purchases = len(df[df["verified_purchase"] == True]) / length
print(percentage_of_verified_purchases)

0.9347297155188511


In [16]:
# correlation between verified purchases and rating (do verified raters give better ratings? Are unverified raters only here to spread hate?)

# a) no downsampling
rating_verified_purchase = df[["rating", "verified_purchase"]]
rating_verified_purchase["verified_purchase"] = rating_verified_purchase["verified_purchase"].apply(lambda x: int(x))
correlation_rating_verified_purchase = rating_verified_purchase['rating'].corr(rating_verified_purchase['verified_purchase'])
print(correlation_rating_verified_purchase)

0.0022662465923941302


In [17]:
# b) downsampling

rating_verified_purchase = df[["rating", "verified_purchase"]]
rating_verified_purchase["verified_purchase"] = rating_verified_purchase["verified_purchase"].apply(lambda x: int(x))

uverified_purchases = df[df["verified_purchase"] == 0]
sampled_verified_purchases = df[df["verified_purchase"] == 1].sample(len(uverified_purchases), random_state=42)
downsampled_df = pd.concat([sampled_verified_purchases, uverified_purchases], ignore_index=True)
correlation_rating_verified_purchase = downsampled_df['rating'].corr(downsampled_df['verified_purchase'])
print(correlation_rating_verified_purchase)



0.005403148978335591


In [18]:
# c) comparisong of mean rating

print(uverified_purchases["rating"].mean())
print(sampled_verified_purchases["rating"].mean())

3.957227834375785
3.972506233268193


In [19]:
# correlation between reviews' author's rating activity (number of reviews) and number of helpful votes?

helpful_reviewcount = df[["user_id", "helpful_vote"]].merge(number_of_reviews_per_user, how="left", left_on="user_id", right_on="user_id")
helpful_reviewcount_correlation = helpful_reviewcount['review_count'].corr(downsampled_df['helpful_vote'])
print(helpful_reviewcount_correlation)

-0.0020771991003059108


In [33]:
# do reviews with images tend to yield in more helpful votes?
images_helpful = df[["images", "helpful_vote"]]
images_helpful["len_images"] = df["images"].apply(lambda x: len(x))
images_helpful_correlation = images_helpful["len_images"].corr(images_helpful["helpful_vote"])
print(images_helpful_correlation)

0.14635963076591357
