In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


In [2]:
import spacy
import nltk
import re
import json
import pandas as pd
import os
import numpy as np

# Read Data

In [3]:
def _load_reviews(file_path, report_every):
    """Read a JSON-lines review file into a list of rows.

    Each non-blank line of the file is parsed with ``json.loads`` and must
    provide the keys 'user', 'item', 'rating' and 'review'.

    Args:
        file_path: path of the JSON-lines file to read.
        report_every: print a progress line each time this many records
            have been loaded.

    Returns:
        A list of ``[item_id, user_id, rating, review]`` lists, in file order.

    Raises:
        KeyError: if a record lacks one of the four expected keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    reviews = []
    # JSON text is UTF-8; be explicit instead of depending on the locale default.
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            # A stray blank line (e.g. at the end of the file) is not a record.
            if not line.strip():
                continue
            line_data = json.loads(line)
            reviews.append([
                line_data['item'],
                line_data['user'],
                line_data['rating'],
                line_data['review'],
            ])
            if len(reviews) % report_every == 0:
                print('{} lines loaded.'.format(len(reviews)))
    return reviews


# Load original dataset
dir_path = '../Dataset/ratebeer/original_dataset'
# Load train dataset
train_review = _load_reviews(os.path.join(dir_path, 'train_no_duplicate.json'), report_every=100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset
test_review = _load_reviews(os.path.join(dir_path, 'test_no_duplicate.json'), report_every=10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

100000 lines loaded.
200000 lines loaded.
300000 lines loaded.
400000 lines loaded.
500000 lines loaded.
600000 lines loaded.
700000 lines loaded.
800000 lines loaded.
900000 lines loaded.
Finish loading train dataset, totally 918465 lines.
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
50000 lines loaded.
60000 lines loaded.
70000 lines loaded.
80000 lines loaded.
90000 lines loaded.
100000 lines loaded.
110000 lines loaded.
Finish loading test dataset, totally 119213 lines.


## Convert list data to pandas dataframe

In [4]:
# Both splits are turned into frames with the same four named columns.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data, df_test_data = (
    pd.DataFrame(rows, columns=review_columns)
    for rows in (train_review, test_review)
)

In [5]:
# Display the training frame built from train_review (columns: item, user, rating, review).
df_train_data

Unnamed: 0,item,user,rating,review
0,652,31,10,poured in a sn pint glass : small fizzy fading...
1,1248,31,16,clear ruby red hue with a fizzy and fully dimi...
2,726,31,17,copper hue with a light orange base and a smal...
3,1160,31,9,bottle and served in my expedition snifter : p...
4,564,31,8,clear straw hue with small head and no lacing ...
...,...,...,...,...
918460,20,3849,7,why do so many people like this beer ? i tried...
918461,269,3849,10,"a good malt liquor , undeniably , but it was l..."
918462,859,3849,2,how this beer is actually better than regular ...
918463,4545,3849,3,the only good thing about this beer is that it...


In [6]:
# Show element 3 (the review text) of the first training row.
train_review[0][3]

'poured in a sn pint glass : small fizzy fading head with minimal lace , golden hue . the aroma has a nice lemony grain aspect with a touch of grassy hops , no notice of corn or butterscotch which is a good thing . the flavor is simple , a mild citric twang on top of some sweet and herbal character . the feel is lacking , should be crisper imo . while no noticeable flaws are present , it lacks any defining character on the palate .'

In [7]:
# Look at the first row only, to see what type the 'item' value has.
for _, first_row in df_train_data.iterrows():
    item_value = first_row['item']
    print(item_value)
    print(type(item_value))
    break

652
<class 'str'>


## Aggregate all the relevant sentences for each user/item in the train dataset

## Filter User/Item

In [8]:
# A dataframe is satisfactory when every user and every item that appears in it
# has at least `lower_thres` and at most `upper_thres` reviews.
lower_thres = 30
upper_thres = 300


def _count_review_buckets(df_review, column, lower, upper):
    """Bucket the distinct values of `column` by how many rows they appear in.

    Returns:
        A tuple ``(n_selected, n_above_upper, n_below_lower)`` of plain ints.
    """
    counts = df_review[column].value_counts()
    above = counts > upper
    # Mirror an if/elif chain: a value already counted as "above" is never "below".
    below = (counts < lower) & ~above
    n_above = int(above.sum())
    n_below = int(below.sum())
    return len(counts) - n_above - n_below, n_above, n_below


def is_satisfy_df(df_review, lower=None, upper=None):
    """Check that every user and every item has an acceptable number of reviews.

    A summary line is printed for users and another one for items.

    Args:
        df_review: DataFrame with at least the columns 'user' and 'item';
            each row is one review. Ids may be of any hashable type.
        lower: minimum number of reviews per user / item. Defaults to the
            module-level ``lower_thres`` (looked up at call time).
        upper: maximum number of reviews per user / item. Defaults to the
            module-level ``upper_thres`` (looked up at call time).

    Returns:
        True when no user and no item has fewer than `lower` or more than
        `upper` reviews (an empty frame therefore returns True), else False.
    """
    if lower is None:
        lower = lower_thres
    if upper is None:
        upper = upper_thres

    # Only the group sizes matter here, so count rows per id instead of
    # materialising one sub-frame per user / item.
    user_ok, user_above, user_below = _count_review_buckets(df_review, 'user', lower, upper)
    print("user_selected: {0} \t user(>upper): {1} \t user(<lower): {2}".format(user_ok, user_above, user_below))

    item_ok, item_above, item_below = _count_review_buckets(df_review, 'item', lower, upper)
    print("item_selected: {0} \t item(>upper): {1} \t item(<lower): {2}".format(item_ok, item_above, item_below))

    # whether this dataframe has rare/popular user/item
    return user_above == 0 and user_below == 0 and item_above == 0 and item_below == 0

In [9]:
def _ids_within_thresholds(df_review, column):
    """Return the ids in `column` whose row count lies in [lower_thres, upper_thres].

    Ids are converted with int() because the raw review rows are matched
    against the returned set via int(row[...]).
    """
    counts = df_review[column].value_counts()
    in_range = counts[(counts >= lower_thres) & (counts <= upper_thres)]
    return {int(raw_id) for raw_id in in_range.index}


# Column order of each raw review row: item at position 0, user at position 1.
review_columns = ['item', 'user', 'rating', 'review']

# Dropping users changes the per-item counts and vice versa, so alternate the
# two filters until the frame passes is_satisfy_df. Every pass can only remove
# rows, and an empty frame passes the check, so the loop terminates.
iter_cnt = 0
while True:
    # Filter user: keep only reviews written by users within the thresholds.
    user_selected = _ids_within_thresholds(df_train_data, 'user')
    print("Start Drop")
    train_review = [row_data for row_data in train_review if int(row_data[1]) in user_selected]
    # convert list to dataframe
    df_train_data = pd.DataFrame(train_review, columns=review_columns)
    # check whether this dataframe satisfies the requirement
    if is_satisfy_df(df_train_data):
        break
    print('Finish User')

    # Filter item: keep only reviews of items within the thresholds.
    item_selected = _ids_within_thresholds(df_train_data, 'item')
    print("Start Drop")
    train_review = [row_data for row_data in train_review if int(row_data[0]) in item_selected]
    # convert list to dataframe
    df_train_data = pd.DataFrame(train_review, columns=review_columns)
    # check whether this dataframe satisfies the requirement
    if is_satisfy_df(df_train_data):
        break
    print('Finish Item')

    iter_cnt += 1
    if iter_cnt % 10 == 0:
        print("{} iterations of filter".format(iter_cnt))

# Whichever branch ended the loop, one of the two sets was computed before the
# last drop; refresh both so they describe the final frame.
user_selected = _ids_within_thresholds(df_train_data, 'user')
item_selected = _ids_within_thresholds(df_train_data, 'item')


Start Drop
user_selected: 2728 	 user(>upper): 0 	 user(<lower): 0
item_selected: 1704 	 item(>upper): 179 	 item(<lower): 5089
Finish User
Start Drop
user_selected: 1719 	 user(>upper): 0 	 user(<lower): 1009
item_selected: 1704 	 item(>upper): 0 	 item(<lower): 0
Finish Item
Start Drop
user_selected: 1719 	 user(>upper): 0 	 user(<lower): 0
item_selected: 1511 	 item(>upper): 0 	 item(<lower): 193
Finish User
Start Drop
user_selected: 1671 	 user(>upper): 0 	 user(<lower): 48
item_selected: 1511 	 item(>upper): 0 	 item(<lower): 0
Finish Item
Start Drop
user_selected: 1671 	 user(>upper): 0 	 user(<lower): 0
item_selected: 1493 	 item(>upper): 0 	 item(<lower): 18
Finish User
Start Drop
user_selected: 1665 	 user(>upper): 0 	 user(<lower): 6
item_selected: 1493 	 item(>upper): 0 	 item(<lower): 0
Finish Item
Start Drop
user_selected: 1665 	 user(>upper): 0 	 user(<lower): 0
item_selected: 1490 	 item(>upper): 0 	 item(<lower): 3
Finish User
Start Drop
user_selected: 1664 	 user(>uppe

In [10]:
# Display the training frame as it stands after the iterative user/item filtering.
df_train_data

Unnamed: 0,item,user,rating,review
0,196,999,17,dark brown body with a light brown head . nutt...
1,1149,999,14,hazy orange / gold body is topped by a medium ...
2,236,999,16,12 oz bottle thanks to acknud . pours much dar...
3,634,999,15,clear and radiant mahogany body with a small b...
4,490,999,16,nice looker with tons of spiderweb lacing . bo...
...,...,...,...,...
117813,779,2624,17,"i beer that you can sit outside , slap your wi..."
117814,325,2624,18,"updated : apr 14 , 2004 not bad to drink , i l..."
117815,447,2624,18,"updated : apr 14 , 2004 a good pale ale . i wa..."
117816,444,2624,2,too lemony for my taste but lived near leinenk...


### Check Whether the Filtered Dataframe Satisfies the Requirement

In [11]:
# Check User
# Bucket every user of the filtered frame by review count. Despite the
# "500" / "20" in the set names, the bounds applied are upper_thres / lower_thres.
user_selected = set()
user_review_morethan500 = set()
user_review_lessthan20 = set()
user_num_reviews = []
group_by_user = df_train_data.groupby('user')
for raw_user_id, user_df in group_by_user:
    user_id = int(raw_user_id)
    n_user_reviews = len(user_df)
    user_num_reviews.append(n_user_reviews)
    if n_user_reviews > upper_thres:
        user_review_morethan500.add(user_id)
    elif n_user_reviews < lower_thres:
        user_review_lessthan20.add(user_id)
    else:
        user_selected.add(user_id)


In [12]:
# Report the size of each user bucket.
n_user_kept = len(user_selected)
n_user_above = len(user_review_morethan500)
n_user_below = len(user_review_lessthan20)
print("Number of user selected: {}".format(n_user_kept))
print("Number of user with more than upper reviews: {}".format(n_user_above))
print("Number of user with less than lower reviews: {}".format(n_user_below))

Number of user selected: 1664
Number of user with more than upper reviews: 0
Number of user with less than lower reviews: 0


In [13]:
# Mean / max / min of the per-user review counts.
user_mean_reviews = np.mean(user_num_reviews)
user_max_reviews = np.max(user_num_reviews)
user_min_reviews = np.min(user_num_reviews)
print("Mean number of reviews per user: {}".format(user_mean_reviews))
print("Max number of reviews per user: {}".format(user_max_reviews))
print("Min number of reviews per user: {}".format(user_min_reviews))

Mean number of reviews per user: 70.80408653846153
Max number of reviews per user: 182
Min number of reviews per user: 30


In [14]:
# Check Item
# Bucket every item of the filtered frame by review count. Despite the
# "500" / "20" in the set names, the bounds applied are upper_thres / lower_thres.
item_selected = set()
item_review_morethan500 = set()
item_review_lessthan20 = set()
item_num_reviews = []
group_by_item = df_train_data.groupby('item')
for raw_item_id, item_df in group_by_item:
    item_id = int(raw_item_id)
    n_item_reviews = len(item_df)
    item_num_reviews.append(n_item_reviews)
    if n_item_reviews > upper_thres:
        item_review_morethan500.add(item_id)
    elif n_item_reviews < lower_thres:
        item_review_lessthan20.add(item_id)
    else:
        item_selected.add(item_id)


In [15]:
# Report the size of each item bucket.
n_item_kept = len(item_selected)
n_item_above = len(item_review_morethan500)
n_item_below = len(item_review_lessthan20)
print("Number of item selected: {}".format(n_item_kept))
print("Number of item with more than upper reviews: {}".format(n_item_above))
print("Number of item with less than lower reviews: {}".format(n_item_below))

Number of item selected: 1490
Number of item with more than upper reviews: 0
Number of item with less than lower reviews: 0


In [16]:
# Mean / max / min of the per-item review counts.
item_mean_reviews = np.mean(item_num_reviews)
item_max_reviews = np.max(item_num_reviews)
item_min_reviews = np.min(item_num_reviews)
print("Mean number of reviews per item: {}".format(item_mean_reviews))
print("Max number of reviews per item: {}".format(item_max_reviews))
print("Min number of reviews per item: {}".format(item_min_reviews))

Mean number of reviews per item: 79.0724832214765
Max number of reviews per item: 263
Min number of reviews per item: 30


In [17]:
# Run the threshold check on the filtered frame; it prints the user/item bucket
# counts and the cell displays the boolean it returns.
is_satisfy_df(df_train_data)

user_selected: 1664 	 user(>upper): 0 	 user(<lower): 0
item_selected: 1490 	 item(>upper): 0 	 item(<lower): 0


True

In [18]:
# Print the first row, its item id, and the Python type of each of its four fields.
for _, first_row in df_train_data.iterrows():
    print(first_row)
    print(first_row['item'])
    for column_name in ('item', 'user', 'rating', 'review'):
        print(type(first_row[column_name]))
    break

item                                                    196
user                                                    999
rating                                                   17
review    dark brown body with a light brown head . nutt...
Name: 0, dtype: object
196
<class 'str'>
<class 'str'>
<class 'int'>
<class 'str'>


## Save the Filtered Train Set into a JSON File

In [20]:
# Display the filtered training frame once more before it is written to disk.
df_train_data

Unnamed: 0,item,user,rating,review
0,196,999,17,dark brown body with a light brown head . nutt...
1,1149,999,14,hazy orange / gold body is topped by a medium ...
2,236,999,16,12 oz bottle thanks to acknud . pours much dar...
3,634,999,15,clear and radiant mahogany body with a small b...
4,490,999,16,nice looker with tons of spiderweb lacing . bo...
...,...,...,...,...
117813,779,2624,17,"i beer that you can sit outside , slap your wi..."
117814,325,2624,18,"updated : apr 14 , 2004 not bad to drink , i l..."
117815,447,2624,18,"updated : apr 14 , 2004 a good pale ale . i wa..."
117816,444,2624,2,too lemony for my taste but lived near leinenk...


In [21]:
# Save the dataframe to json file (one JSON object per line)
train_out_path = '../Dataset/ratebeer/medium/train_review_filtered.json'
# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(os.path.dirname(train_out_path), exist_ok=True)
with open(train_out_path, 'w') as f1:
    # itertuples yields one lightweight tuple per row instead of building a
    # Series per row as iterrows does; fields are read by column name.
    for row in df_train_data.itertuples(index=False):
        row_dict = {'user': row.user, 'item': row.item, 'rating': row.rating, 'review': row.review}
        # dump this dict into file
        json.dump(row_dict, f1)
        f1.write("\n")

## Filter Test set

In [22]:
# Display the test frame before any filtering is applied to it.
df_test_data

Unnamed: 0,item,user,rating,review
0,922,31,11,bottle : clean yellow hue topped by a lasting ...
1,736,31,15,bottle and served in my big ole karmeliet glas...
2,65,31,12,"cloudy orange hue , almost no head . hoppy nos..."
3,2175,31,15,bottle and served in my stone irs snifter : in...
4,3929,31,15,deep brown hue with fizzy off - white head and...
...,...,...,...,...
119208,538,3849,12,now this shit here is my bread and butter . ol...
119209,571,3849,2,"well , colt 45 earns credit for at least one t..."
119210,21,3849,3,i ca nt believe i was this generous to this be...
119211,2156,3849,7,"the thing about stock though , i think it is a..."


In [23]:
# Number of raw test rows before filtering.
len(test_review)

119213

In [24]:
def _user_and_item_selected(test_data):
    """True when both the row's user id and its item id are in the selected sets."""
    item_id, user_id = int(test_data[0]), int(test_data[1])
    return user_id in user_selected and item_id in item_selected


# Keep only the test rows whose user and item are both in the selected sets.
test_review_filter = list(filter(_user_and_item_selected, test_review))

In [25]:
# Number of test rows that remain after filtering.
len(test_review_filter)

14677

In [26]:
# Build a frame from the filtered test rows, with the four named columns.
filtered_test_columns = ['item', 'user', 'rating', 'review']
df_test_data_filter = pd.DataFrame(data=test_review_filter, columns=filtered_test_columns)

In [27]:
# Display the filtered test frame.
df_test_data_filter

Unnamed: 0,item,user,rating,review
0,760,999,14,bomber pours a hazy caramel hued body with a s...
1,842,999,14,bottle pours ahazy apricot body with a small o...
2,442,999,16,"pours a hazy , dark caramel body with no head ..."
3,274,999,16,hazed peach body supports a small offwhite hea...
4,476,999,15,picked up a couple bottles of these at a local...
...,...,...,...,...
14672,1860,2624,7,sour apple taste that left a terrbile aftermath .
14673,630,2624,18,great tasting beer . goes with any type of foo...
14674,729,2624,16,has a red looking appearance that has a taste ...
14675,1553,2624,9,"i have tasted worse , so this beer is nt too b..."


### Save the Filtered Test Set into a JSON File

In [28]:
# Save the dataframe to json file (one JSON object per line)
test_out_path = '../Dataset/ratebeer/medium/test_review_filtered.json'
# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(os.path.dirname(test_out_path), exist_ok=True)
with open(test_out_path, 'w') as f1:
    # itertuples yields one lightweight tuple per row instead of building a
    # Series per row as iterrows does; fields are read by column name.
    for row in df_test_data_filter.itertuples(index=False):
        row_dict = {'user': row.user, 'item': row.item, 'rating': row.rating, 'review': row.review}
        # dump this dict into file
        json.dump(row_dict, f1)
        f1.write("\n")