In [2]:
import json

# Preprocessing data for linear bandits for recommendation systems using Amazon music reviews

In [8]:
# Read the reviews from a JSON Lines file: every non-empty line is parsed as
# one JSON document and the results are collected, in file order, in `reviews`.
# - The encoding is passed explicitly so decoding does not depend on the
#   platform's locale default (JSON text is UTF-8 by specification).
# - Blank / whitespace-only lines (e.g. a trailing empty line) are skipped,
#   because json.loads('') raises JSONDecodeError.
with open('Digital_Music_5.json', encoding='utf-8') as f:
    reviews = [json.loads(line) for line in f if line.strip()]


In [9]:
# Sanity check: show the first five parsed reviews to inspect the record layout
first_reviews = reviews[:5]
print(first_reviews)

[{'reviewerID': 'A3EBHHCZO6V2A4', 'asin': '5555991584', 'reviewerName': 'Amaranth "music fan"', 'helpful': [3, 3], 'reviewText': 'It\'s hard to believe "Memory of Trees" came out 11 years ago;it has held up well over the passage of time.It\'s Enya\'s last great album before the New Age/pop of "Amarantine" and "Day without rain." Back in 1995,Enya still had her creative spark,her own voice.I agree with the reviewer who said that this is her saddest album;it is melancholy,bittersweet,from the opening title song."Memory of Trees" is elegaic&majestic.;"Pax Deorum" sounds like it is from a Requiem Mass,it is a dark threnody.Unlike the reviewer who said that this has a "disconcerting" blend of spirituality&sensuality;,I don\'t find it disconcerting at all."Anywhere is" is a hopeful song,looking to possibilities."Hope has a place" is about love,but it is up to the listener to decide if it is romantic,platonic,etc.I\'ve always had a soft spot for this song."On my way home" is a triumphant endi

In [10]:
# Group the reviews per reviewer.
# Resulting mapping: reviewer ID -> list of that reviewer's reviews,
# each list keeping the order in which the reviews appear in `reviews`.
reviews_by_reviewer = {}

for review in reviews:
    reviewer_id = review['reviewerID']
    # setdefault inserts an empty list the first time a reviewer ID is seen
    # and returns the existing list on every later occurrence.
    reviews_by_reviewer.setdefault(reviewer_id, []).append(review)

In [20]:
# Report how many distinct reviewer IDs were found (one dictionary key each)
reviewer_count = len(reviews_by_reviewer)
print("Total number of reviewers : ", reviewer_count)

Total number of reviewers :  5541


In [13]:
# Sanity check: show the first five (reviewer ID, reviews list) pairs
grouped_pairs = list(reviews_by_reviewer.items())
print(grouped_pairs[:5])

[('A3EBHHCZO6V2A4', [{'reviewerID': 'A3EBHHCZO6V2A4', 'asin': '5555991584', 'reviewerName': 'Amaranth "music fan"', 'helpful': [3, 3], 'reviewText': 'It\'s hard to believe "Memory of Trees" came out 11 years ago;it has held up well over the passage of time.It\'s Enya\'s last great album before the New Age/pop of "Amarantine" and "Day without rain." Back in 1995,Enya still had her creative spark,her own voice.I agree with the reviewer who said that this is her saddest album;it is melancholy,bittersweet,from the opening title song."Memory of Trees" is elegaic&majestic.;"Pax Deorum" sounds like it is from a Requiem Mass,it is a dark threnody.Unlike the reviewer who said that this has a "disconcerting" blend of spirituality&sensuality;,I don\'t find it disconcerting at all."Anywhere is" is a hopeful song,looking to possibilities."Hope has a place" is about love,but it is up to the listener to decide if it is romantic,platonic,etc.I\'ve always had a soft spot for this song."On my way home" 

The format of the dictionary is key: reviewerID, value: [review1, review2, ...]

## Step 1: removing users with insufficient reviews

It's possible that some users have only reviewed a few items, which makes it hard to estimate their true preferences. We can filter out users who have reviewed fewer than a certain number of items (e.g. 20) to ensure that we have enough data to estimate their preferences reliably.

In [25]:
# Minimum number of reviews a reviewer must have written to be kept
MIN_NUM_REVIEWS = 20

# Keep only the reviewers whose review list has at least MIN_NUM_REVIEWS entries;
# the kept values are the same list objects as in reviews_by_reviewer.
reviews_by_reviewer_filtered_by_nb_of_reviews = {
    reviewer: written_reviews
    for reviewer, written_reviews in reviews_by_reviewer.items()
    if len(written_reviews) >= MIN_NUM_REVIEWS
}

In [26]:
# Report how many reviewers survive the review-count filter.
# The filter keeps reviewers with len(reviews) >= MIN_NUM_REVIEWS, so a reviewer
# with exactly MIN_NUM_REVIEWS reviews is counted: the label must say "at least",
# not "more than".
print("Total number of reviewers with at least " + str(MIN_NUM_REVIEWS) + " reviews : " + str(len(reviews_by_reviewer_filtered_by_nb_of_reviews)))

Total number of reviewers with more than 20 reviews : 596


In [27]:
# Sanity check: show the first five (reviewer ID, reviews list) pairs that
# passed the review-count filter
count_filtered_pairs = list(reviews_by_reviewer_filtered_by_nb_of_reviews.items())
print(count_filtered_pairs[:5])



## Step 2: removing users with low variance

If a user always gives high ratings (e.g., 4-5 stars), their reviews might not be informative for distinguishing between items that they like more or less. We can filter out users whose ratings have low variance, so that we focus on users whose ratings actually discriminate between the items they prefer and the items they do not.


In [45]:
# Keep only the reviewers whose ratings vary enough.
# Threshold on the sample variance of a reviewer's 'overall' ratings
# (2.25 corresponds to a standard deviation of 1.5).
MIN_VARIANCE = 2.25

# Result mapping: reviewer ID -> list of reviews, for reviewers passing the threshold
reviews_by_reviewer_filtered_by_variance = {}

# Loop over the reviewers that already passed the review-count filter
for reviewer_id, reviews_list in reviews_by_reviewer_filtered_by_nb_of_reviews.items():
    n = len(reviews_list)
    ratings = [review['overall'] for review in reviews_list]

    # Mean rating of this reviewer (plain running sum, then divide by the count)
    avg = 0.
    for rating in ratings:
        avg += rating
    avg /= float(n)

    # Sum of squared deviations from the mean ...
    variance = 0.
    for rating in ratings:
        deviation = rating - avg
        variance += deviation ** 2
    # ... divided by n - 1 (sample variance); skipped when there is a single
    # rating, in which case the sum of squared deviations is left undivided.
    if n > 1:
        variance /= n - 1

    # Retain the reviewer (with the unchanged list of reviews) only when the
    # variance reaches the threshold
    if variance >= MIN_VARIANCE:
        reviews_by_reviewer_filtered_by_variance[reviewer_id] = reviews_list

In [46]:
# Report how many reviewers survive both filters.
# Both filters are inclusive (len(reviews) >= MIN_NUM_REVIEWS and
# variance >= MIN_VARIANCE), so the label says "at least" for each threshold
# instead of "more than".
print("Total number of reviewers with at least " + str(MIN_NUM_REVIEWS) + " reviews and a rating variance of at least " + str(MIN_VARIANCE) + " : " + str(len(reviews_by_reviewer_filtered_by_variance)))

Total number of reviewers with more than 20 reviews and 2.25 variance : 35


In [47]:
# Sanity check: show the first five (reviewer ID, reviews list) pairs that
# passed the variance filter
variance_filtered_pairs = list(reviews_by_reviewer_filtered_by_variance.items())
print(variance_filtered_pairs[:5])

