In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


In [2]:
import spacy
import nltk
import re
import json
import pandas as pd
import os
import numpy as np
from nltk.tokenize import sent_tokenize

# Load Data

In [4]:
# Load Data from the original dataset
def _load_reviews(file_path, log_every):
    """Read a JSON-lines review file into a list of rows.

    Each line of the file is parsed as one JSON object; its 'item', 'user',
    'rating' and 'review' fields are collected in that order.

    Parameters
    ----------
    file_path : str
        Path of the JSON-lines file to read.
    log_every : int
        Print a progress message each time this many lines have been read.

    Returns
    -------
    list of list
        One ``[item_id, user_id, rating, review]`` row per line of the file.
    """
    reviews = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, encoding='utf-8') as f:
        for cnt, line in enumerate(f, start=1):
            line_data = json.loads(line)
            reviews.append([
                line_data['item'],
                line_data['user'],
                line_data['rating'],
                line_data['review'],
            ])
            if cnt % log_every == 0:
                print('{} lines loaded.'.format(cnt))
    return reviews


dir_path = '../Dataset/ratebeer/original_dataset'
# Load train dataset
train_review = _load_reviews(os.path.join(dir_path, 'train.json'), log_every=100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset (progress is reported at a finer interval than for train)
test_review = _load_reviews(os.path.join(dir_path, 'test.json'), log_every=10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

100000 lines loaded.
200000 lines loaded.
300000 lines loaded.
400000 lines loaded.
500000 lines loaded.
600000 lines loaded.
700000 lines loaded.
800000 lines loaded.
900000 lines loaded.
Finish loading train dataset, totally 939044 lines.
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
50000 lines loaded.
60000 lines loaded.
70000 lines loaded.
80000 lines loaded.
90000 lines loaded.
100000 lines loaded.
110000 lines loaded.
Finish loading test dataset, totally 119576 lines.


# Check for Reviews with 0 Sentences

In [5]:
# Count the training reviews that the sentence tokenizer splits into zero
# sentences. Any such review must be the empty string, otherwise the assert
# stops the cell.
cnt = 0
for *_, review_text in train_review:
    if not sent_tokenize(review_text):
        assert review_text == ''
        cnt += 1

In [6]:
# Display the value of the counter left by the most recently executed counting cell.
cnt

0

In [9]:
# Count the test reviews whose text (last field of each row) is the empty string.
cnt = sum(1 for *_, review_text in test_review if review_text == '')

In [10]:
# Display the value of the counter left by the most recently executed counting cell.
cnt

0

# Check Duplicate Reviews

In [13]:
# in train set
# A review is identified by its (user, item) pair, with both ids compared as
# strings; row[0] is the item id and row[1] is the user id.
trainset_user_item_set = {(str(record[1]), str(record[0])) for record in train_review}
# Every row beyond the first one for a given pair is a duplicate.
cnt_duplicate_review = len(train_review) - len(trainset_user_item_set)

print("[Train] Number of duplicate reviews: {}".format(cnt_duplicate_review))
print("[Train] Number of unique reviews: {}".format(len(trainset_user_item_set)))

[Train] Number of duplicate reviews: 20579
[Train] Number of unique reviews: 918465


In [20]:
# in test set
# A review is identified by its (user, item) pair, with both ids compared as
# strings; row[0] is the item id and row[1] is the user id.
testset_user_item_set = {(str(record[1]), str(record[0])) for record in test_review}
# Every row beyond the first one for a given pair is a duplicate.
cnt_duplicate_review = len(test_review) - len(testset_user_item_set)

print("[Test] Number of duplicate reviews: {}".format(cnt_duplicate_review))
print("[Test] Number of unique reviews: {}".format(len(testset_user_item_set)))

[Test] Number of duplicate reviews: 363
[Test] Number of unique reviews: 119213


In [12]:
# Wrap the loaded rows in DataFrames; the column order matches the order in
# which each row's fields were stored.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data, df_test_data = (
    pd.DataFrame(rows, columns=review_columns)
    for rows in (train_review, test_review)
)

In [17]:
# Make sure that duplicate reviews (same user and item) have the same rating
# and the same review text.
# Group by multiple columns (user and item).
groupby_user_item = df_train_data.groupby(['user', 'item'])
cnt_duplicate_review = 0
cnt_user_item_pair = 0
# Iterating a GroupBy already yields (key, sub-frame) pairs, so the sub-frame
# is used directly instead of looking it up again with get_group(key).
for key, cur_df_user_item in groupby_user_item:
    cnt_user_item_pair += 1
    if len(cur_df_user_item) > 1:
        rating_list = list(cur_df_user_item['rating'])
        review_text_list = list(cur_df_user_item['review'])
        # every row beyond the first one of a pair counts as a duplicate
        cnt_duplicate_review += len(cur_df_user_item) - 1
        # ratings must agree across all copies of the same (user, item) review
        assert all(rating == rating_list[0] for rating in rating_list), \
            "Duplicate reviews for (user, item) {} have different ratings".format(key)
        # show each group whose copies differ in text (printed once per group)
        if any(text != review_text_list[0] for text in review_text_list):
            print(cur_df_user_item)
print("Total unique user-item pair: {}".format(cnt_user_item_pair))
print("Total duplicate reviews: {}".format(cnt_duplicate_review))

        item user  rating                                             review
324642  1637  103      14  good head , creamy , over copper - chestnut be...
325261  1637  103      14  good head , creamy , over copper - chest nut b...
       item  user  rating                                             review
557159  361  1139      12  pours golden with fizzy white head that quickl...
557181  361  1139      12  pours golden with fizzy white head that quickl...
        item user  rating                                             review
192136  2242  116      16  i have belgianprincess on ba to thank for this...
192819  2242  116      16  i have belgianprincess on ba to thank for this...
       item user  rating                                             review
266823  520  167      14  updated : aug 30 , 2010 gabf 2009 , day 1 : da...
267042  520  167      14  updated : aug 30 , 2010 gabf 2009 , day 1 : da...
        item  user  rating                                             review
8

# Filter Out Data with 0 Sentences
Make sure that no review in the training/testing set has an empty review text; duplicate (user, item) reviews are dropped as well.

In [18]:
# Write the training reviews back out as JSON lines (one object per line),
# skipping rows with an empty review text and keeping only the first row seen
# for each (user, item) pair.
with open('../Dataset/ratebeer/original_dataset/train_no_duplicate.json', 'w') as f1:
    user_item_set = set()
    cnt_reviews = 0
    for item_id, user_id, rating, text in train_review:
        pair = (user_id, item_id)
        if text == '' or pair in user_item_set:
            continue
        user_item_set.add(pair)
        cnt_reviews += 1
        # serialise this review as one line of the output file
        record = {'user': user_id, 'item': item_id, 'rating': rating, 'review': text}
        f1.write(json.dumps(record) + "\n")
print("[Train] number of reviews in train set: {}".format(cnt_reviews))

[Train] number of reviews in train set: 918465


In [21]:
# Write the test reviews back out as JSON lines (one object per line),
# skipping rows with an empty review text and keeping only the first row seen
# for each (user, item) pair.
with open('../Dataset/ratebeer/original_dataset/test_no_duplicate.json', 'w') as f1:
    user_item_set = set()  # (user, item) pairs already written
    cnt_reviews = 0        # number of rows written to the file
    for row in test_review:
        item_id = row[0]
        user_id = row[1]
        rating = row[2]
        text = row[-1]
        if text == '':
            # drop reviews without any text
            continue
        if (user_id, item_id) in user_item_set:
            # a review for this (user, item) pair was already written
            continue
        user_item_set.add((user_id, item_id))
        row_dict = {'user': user_id, 'item': item_id, 'rating': rating, 'review': text}
        cnt_reviews += 1
        # dump this dict into file
        json.dump(row_dict, f1)
        f1.write("\n")
# The message used to say "train set" although this cell processes the test set.
print("[Test] number of reviews in test set: {}".format(cnt_reviews))

[Test] number of reviews in train set: 119213
