In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


In [2]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize

In [3]:
import string

from spacy.lang.en import English

# Build the English language object and keep a handle on its tokenizer,
# which uses the default English settings (punctuation rules and exceptions).
nlp = English()
tokenizer = nlp.tokenizer

# Punctuation characters provided by the standard library.
punct = string.punctuation

In [4]:
# Dataset variant; interpolated into every '../Dataset/ratebeer/{}' path used below.
dataset_name = 'large_500'

# Read Data

## Load Dataset

In [5]:
dir_path = '../Dataset/ratebeer/{}'.format(dataset_name)
file_path = os.path.join(dir_path, 'test_review_filtered.json')

# Load test dataset. The file holds one JSON record per line; each record is
# stored as [item, user, rating, review], the column order used when the
# DataFrame is built from `test_review`.
test_review = []
cnt = 0
# Decode explicitly as UTF-8 so the review text does not depend on the
# platform's locale-default encoding.
with open(file_path, encoding='utf-8') as f:
    for cnt, line in enumerate(f, start=1):
        line_data = json.loads(line)
        test_review.append([
            line_data['item'],
            line_data['user'],
            line_data['rating'],
            line_data['review'],
        ])
        # Progress report every 10000 lines.
        if cnt % 10000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
Finish loading test dataset, totally 40730 lines.


In [6]:
# Tabulate the loaded records; the columns follow the [item, user, rating, review] order of each row.
df_test_data = pd.DataFrame(test_review, columns=['item', 'user', 'rating', 'review'])

In [7]:
# Display the test DataFrame for a visual check of the loaded columns.
df_test_data

Unnamed: 0,item,user,rating,review
0,760,999,14,bomber pours a hazy caramel hued body with a s...
1,842,999,14,bottle pours ahazy apricot body with a small o...
2,442,999,16,"pours a hazy , dark caramel body with no head ..."
3,274,999,16,hazed peach body supports a small offwhite hea...
4,476,999,15,picked up a couple bottles of these at a local...
...,...,...,...,...
40725,325,3849,1,"this beer is not worth my time to rate it , so..."
40726,446,3849,5,"really not terrible for the price , but has th..."
40727,538,3849,12,now this shit here is my bread and butter . ol...
40728,571,3849,2,"well , colt 45 earns credit for at least one t..."


## Load Sentence2ID and ID2Sentence Mapping From Training Set

In [8]:
# Sentence -> id mapping stored under the train/ directory.
train_sent2id_path = '../Dataset/ratebeer/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(train_sent2id_path, 'r') as f:
    trainset_sent_to_id = json.load(f)

In [9]:
type(trainset_sent_to_id['bottle at home .'])
# Sanity check: the ids stored in this mapping are strings, not ints.

str

In [10]:
# Id -> sentence mapping stored under the train/ directory.
train_id2sent_path = '../Dataset/ratebeer/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(train_id2sent_path, 'r') as f:
    trainset_id_to_sent = json.load(f)

In [11]:
trainset_id_to_sent['0']
# Sanity check: this mapping is keyed by string ids ('0', not 0).

'bottle at home .'

In [12]:
# The two training mappings must describe the same number of sentences.
num_train_sentences = len(trainset_id_to_sent)
assert len(trainset_sent_to_id) == num_train_sentences
print("There are {} sentences in the training set.".format(num_train_sentences))

There are 1246458 sentences in the training set.


## Load Feature Words

In [13]:
# The feature vocabulary is shared by training and testing: it is read from
# the train/ directory because only the training review text can be known.
feature_2_id_file = '../Dataset/ratebeer/{}/train/feature/feature2id.json'.format(dataset_name)
with open(feature_2_id_file, 'r') as feature_file:
    feature_vocab = json.load(feature_file)

In [14]:
# Number of entries in the feature vocabulary.
len(feature_vocab)

2000

In [15]:
# Keep only the words (the dict keys), in the dict's iteration order.
feature_word_list = list(feature_vocab)
print('Number of feature words: {}'.format(len(feature_word_list)))

Number of feature words: 2000


## Load Sentence2Feature

# Build Sentence Vocab on Test set

## Check Whether there are reviews with no sentence

In [16]:
# Count the reviews that the sentence tokenizer splits into zero sentences,
# printing each offending row.
invalid_data = 0
for _, review_row in df_test_data.iterrows():
    if not sent_tokenize(review_row['review']):
        print(review_row)
        invalid_data += 1

In [17]:
# Number of reviews that produced no sentences at all.
print(invalid_data)

0


In [18]:
def get_tf_score(text, feature_word_list):
    """Count feature-word occurrences in each document of ``text``.

    Args:
        text: iterable of strings; here, the sentences of one review.
        feature_word_list: fixed vocabulary handed to the vectorizer, so only
            these words are counted.

    Returns:
        Dense 2-D array of counts with one row per document in ``text`` and
        one column per entry of ``feature_word_list``.
    """
    # The vocabulary is fixed up front, so nothing is learned from ``text``.
    counter = CountVectorizer(vocabulary=feature_word_list, lowercase=True)
    counts = counter.fit_transform(text)
    return counts.toarray()

In [19]:
# Sentence vocab: occurrence count of every test-set sentence that contains
# at least one feature word.
sentence_count = dict()
# Sentences skipped because no feature word occurs in them.
sentence_with_no_feature = 0
# Loop for each review
for idx, review_text in df_test_data['review'].items():
    review_sents = sent_tokenize(review_text)
    # Row sums of the count matrix: total feature-word hits per sentence.
    tf_sum_sents = get_tf_score(review_sents, feature_word_list).sum(axis=1)
    for cur_sent, feature_hits in zip(review_sents, tf_sum_sents):
        if feature_hits == 0.0:
            sentence_with_no_feature += 1
            continue
        sentence_count[cur_sent] = sentence_count.get(cur_sent, 0) + 1
    if (idx+1) % 1000 == 0:
        print("Processed {} lines".format(idx+1))
print("There are {} sentences with no feature words".format(sentence_with_no_feature))

Processed 1000 lines
Processed 2000 lines
Processed 3000 lines
Processed 4000 lines
Processed 5000 lines
Processed 6000 lines
Processed 7000 lines
Processed 8000 lines
Processed 9000 lines
Processed 10000 lines
Processed 11000 lines
Processed 12000 lines
Processed 13000 lines
Processed 14000 lines
Processed 15000 lines
Processed 16000 lines
Processed 17000 lines
Processed 18000 lines
Processed 19000 lines
Processed 20000 lines
Processed 21000 lines
Processed 22000 lines
Processed 23000 lines
Processed 24000 lines
Processed 25000 lines
Processed 26000 lines
Processed 27000 lines
Processed 28000 lines
Processed 29000 lines
Processed 30000 lines
Processed 31000 lines
Processed 32000 lines
Processed 33000 lines
Processed 34000 lines
Processed 35000 lines
Processed 36000 lines
Processed 37000 lines
Processed 38000 lines
Processed 39000 lines
Processed 40000 lines
There are 506 sentences with no feature words


In [20]:
# Number of distinct test-set sentences that contain at least one feature word.
len(sentence_count)

176015

In [21]:
# (sentence, count) pairs ordered from most to least frequent; most counts
# should be 1. The sort is stable, so ties keep their insertion order.
sorted_sent_counts = sorted(sentence_count.items(), key=lambda pair: pair[1], reverse=True)

In [22]:
# Building mappings from sentences to ids and ids to sentences.
# An id is the string form of the sentence's position in sorted_sent_counts.
testset_sent_to_id = {sent: str(rank) for rank, (sent, _count) in enumerate(sorted_sent_counts)}
# Since we loaded all the tokenized sentences, we don't need to add the special UNK token
testset_id_to_sent = {sent_id: sent for sent, sent_id in testset_sent_to_id.items()}

## Save Sentence to ID into Json File (Test set)

In [24]:
# Each output file paired with the mapping written to it. The valid/ files
# receive the same test-set mappings as the test/ files.
sentence_map_outputs = [
    ('../Dataset/ratebeer/{}/test/sentence/id2sentence.json', testset_id_to_sent),
    ('../Dataset/ratebeer/{}/test/sentence/sentence2id.json', testset_sent_to_id),
    ('../Dataset/ratebeer/{}/valid/sentence/id2sentence.json', testset_id_to_sent),
    ('../Dataset/ratebeer/{}/valid/sentence/sentence2id.json', testset_sent_to_id),
]
for path_template, sentence_map in sentence_map_outputs:
    with open(path_template.format(dataset_name), 'w') as f:
        json.dump(sentence_map, f)


In [26]:
# Read back the two sentence mappings stored under the valid/ directory.
valid_id2sent_path = '../Dataset/ratebeer/{}/valid/sentence/id2sentence.json'.format(dataset_name)
with open(valid_id2sent_path, 'r') as f:
    validset_id_to_sent = json.load(f)

valid_sent2id_path = '../Dataset/ratebeer/{}/valid/sentence/sentence2id.json'.format(dataset_name)
with open(valid_sent2id_path, 'r') as f:
    validset_sent_to_id = json.load(f)

In [27]:
# Number of sentences in the test-set id -> sentence mapping.
len(testset_id_to_sent)

176015

## Load User to Sentence ID

In [28]:
# User id -> sentence ids, stored under the train/ directory.
train_user2sentids_path = '../Dataset/ratebeer/{}/train/user/user2sentids.json'.format(dataset_name)
with open(train_user2sentids_path, 'r') as f:
    trainset_user_to_sent_id = json.load(f)

## Load Item to Sentence ID

In [29]:
# Item id -> sentence ids, stored under the train/ directory.
train_item2sentids_path = '../Dataset/ratebeer/{}/train/item/item2sentids.json'.format(dataset_name)
with open(train_item2sentids_path, 'r') as f:
    trainset_item_to_sent_id = json.load(f)

In [30]:
# Number of users that have an entry in the training user -> sentence-ids mapping.
len(trainset_user_to_sent_id)

2963

In [31]:
# Spot check: number of training sentence ids stored for user '1223'.
len(trainset_user_to_sent_id['1223'])

140

In [32]:
# Number of items that have an entry in the training item -> sentence-ids mapping.
len(trainset_item_to_sent_id)

3744

In [33]:
# Spot check: number of training sentence ids stored for item '199'.
len(trainset_item_to_sent_id['199'])

1954

# For Each Data Instance in Testset

## GroupBy User

In [34]:
# Group the test reviews by user id; each group holds one user's rows.
group_by_user_test = df_test_data.groupby('user')

In [35]:
# Number of user groups, i.e. distinct users in the test data.
len(group_by_user_test)

2963

In [36]:
# Same count, obtained by materialising the (user, frame) pairs that the loops below iterate over.
len(list(group_by_user_test))

2963

# Construct Valid Dataset

### Remember: in the valid set we sample candidate sentences the same way we did on the train set.

In [37]:
import random

# Maximum number of TRAIN-set candidate sentences kept per (user, item) pair.
sample_sent_num = 500
# Private, seeded generator: the sampled candidate sets are reproducible
# across kernel restarts and the global `random` state is left untouched.
valid_sampling_rng = random.Random(42)

# user id (str) -> item id (str) -> [candidate sentence ids, true review sentence ids]
user_item_candidate_sent_ids_validset = dict()
cnt_empty_true_sentence = 0
review_cnt = 0
# Size of the full (pre-sampling) candidate set of every kept (user, item) pair.
user_item_candidate_sentence_num = list()
cnt_being_cut_useritem = 0
user_cnt = 0
# Loop over all users
for raw_user_id, user_df in group_by_user_test:
    user_id_str = str(raw_user_id)
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for raw_item_id, review_text in zip(user_df['item'], user_df['review']):
        item_id_str = str(raw_item_id)
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # Ids (on TEST set) of this review's sentences that exist in
        # testset_sent_to_id; duplicates are dropped while keeping the
        # order in which the sentences appear in the review.
        review_sents = sent_tokenize(review_text)
        cur_review_sent_ids = list(dict.fromkeys(
            testset_sent_to_id[sent] for sent in review_sents if sent in testset_sent_to_id
        ))
        # Candidate set = union of the user's and the item's sentences.
        # It is turned into a sorted list because random.sample requires a
        # sequence (sets were deprecated in Python 3.9 and raise TypeError
        # from 3.11) and because set iteration order is not stable between
        # runs, which would defeat the seed. key=str keeps the order
        # well-defined whatever type the ids have.
        cur_useritem_sent_ids = sorted(cur_user_sent_ids | cur_item_sent_ids, key=str)
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = valid_sampling_rng.sample(cur_useritem_sent_ids, sample_sent_num)
            cnt_being_cut_useritem += 1
        else:
            # Small enough: keep the whole user+item union.
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict; reviews with no known true sentence are dropped
        if len(cur_review_sent_ids) != 0:
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), cur_review_sent_ids]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    user_item_candidate_sent_ids_validset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 500 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print('During constructing, {} user-item pair are being cutted due to their length'.format(cnt_being_cut_useritem))

500 user processed.
1000 user processed.
1500 user processed.
2000 user processed.
2500 user processed.
Finish.
Totally 2963 users
Totally 40730 reviews. Among them 1 reviews has empty true label sentence
During constructing, 35679 user-item pair are being cutted due to their length


In [38]:
import numpy as np

# Summary of the pre-sampling candidate-set sizes collected while building the dict.
candidate_set_sizes = user_item_candidate_sentence_num
print("Totally {} user item pairs in the testset".format(len(candidate_set_sizes)))
largest_candidate_set = np.max(candidate_set_sizes)
print("max number of candidate sentence: {}".format(largest_candidate_set))
smallest_candidate_set = np.min(candidate_set_sizes)
print("min number of candidate sentence: {}".format(smallest_candidate_set))

Totally 40729 user item pairs in the testset
max number of candidate sentence: 7010
min number of candidate sentence: 83


In [39]:
# The 20 largest pre-sampling candidate-set sizes, in ascending order.
sorted(user_item_candidate_sentence_num)[-20:]

[5613,
 5633,
 5643,
 5651,
 5694,
 5721,
 5742,
 5756,
 5767,
 5804,
 5894,
 5898,
 5903,
 5928,
 5983,
 6117,
 6118,
 6348,
 6351,
 7010]

In [40]:
# Number of users in the valid-split user -> item -> sentence-ids dict.
len(user_item_candidate_sent_ids_validset)

2963

In [41]:
# Persist the nested user -> item -> [candidates, true sentences] dict as one JSON document.
valid_useritem_path = '../Dataset/ratebeer/{}/valid/useritem2sentids_test.json'.format(dataset_name)
with open(valid_useritem_path, 'w') as f:
    json.dump(user_item_candidate_sent_ids_validset, f)

In [42]:
# Spot check one (user, item) entry of the valid-split dict.
check_user_id = "1988"
check_item_id = "202"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
# Entry layout: index 0 = candidate sentence ids, index 1 = true review sentence ids.
valid_entry = user_item_candidate_sent_ids_validset[check_user_id][check_item_id]
print("number of sentence in candidate set: {}".format(len(valid_entry[0])))
print("number of sentence in true review set: {}".format(len(valid_entry[1])))

user: 1988 	 item: 202
number of sentence in candidate set: 500
number of sentence in true review set: 4


In [43]:
# Checking How Many User/Item/Review are in the test set.
# The counts are taken from the nested user -> item dict built for the valid
# split: one review per (user, item) entry, items deduplicated across users.
cnt_user = 0
cnt_review = 0
cnt_item_set = set()
for item_candidate_dict in user_item_candidate_sent_ids_validset.values():
    cnt_user += 1
    cnt_review += len(item_candidate_dict)
    # str() mirrors the original normalisation; the keys are already strings.
    cnt_item_set.update(str(item_key) for item_key in item_candidate_dict)

print("Total number of reviews: {}".format(cnt_review))
print("Total number of user: {}".format(cnt_user))
print("Total number of item: {}".format(len(cnt_item_set)))

Total number of reviews: 40729
Total number of user: 2963
Total number of item: 3669


In [44]:
# Write useritem2sentids_test into a line-by-line format:
# one JSON object per (user, item) pair, each on its own line.
with open('../Dataset/ratebeer/{}/valid/useritem2sentids_test_multilines.json'.format(dataset_name), 'w') as f1:
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    useritem_set = set()
    for user_key, items_of_user in user_item_candidate_sent_ids_validset.items():
        user_id_str = str(user_key)
        # Ids are written as integers in the per-line records.
        user_id = int(user_key)
        user_set.add(user_id_str)
        cnt_user += 1
        for item_key, sent_id_lists in items_of_user.items():
            item_id_str = str(item_key)
            item_id = int(item_key)
            # Stored layout: [candidate sentence ids, true review sentence ids].
            candidate_sent_ids = sent_id_lists[0]
            true_revw_sent_ids = sent_id_lists[1]
            cur_data_dict = {
                'user_id':user_id,
                'item_id':item_id,
                'candidate':candidate_sent_ids,
                "review":true_revw_sent_ids,
            }
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            useritem_set.add((user_id_str, item_id_str))
            cnt_review += 1

print("Total {} users".format(cnt_user))
print("Totat {} reviews".format(cnt_review))

Total 2963 users
Totat 40729 reviews


# Construct Test Dataset

In [50]:
# Maximum number of TRAIN-set candidate sentences kept per (user, item) pair.
sample_sent_num = 2600
# Private, seeded generator: the sampled candidate sets are reproducible
# across kernel restarts and the global `random` state is left untouched.
test_sampling_rng = random.Random(42)

# user id (str) -> item id (str) -> [candidate sentence ids, true review sentence ids]
user_item_candidate_sent_ids_testset = dict()
cnt_empty_true_sentence = 0
review_cnt = 0
# Size of the full (pre-sampling) candidate set of every kept (user, item) pair.
user_item_candidate_sentence_num = list()
cnt_being_cut_useritem = 0
user_cnt = 0
# Loop over all users
for raw_user_id, user_df in group_by_user_test:
    user_id_str = str(raw_user_id)
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for raw_item_id, review_text in zip(user_df['item'], user_df['review']):
        item_id_str = str(raw_item_id)
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # Ids (on TEST set) of this review's sentences that exist in
        # testset_sent_to_id; duplicates are dropped while keeping the
        # order in which the sentences appear in the review.
        review_sents = sent_tokenize(review_text)
        cur_review_sent_ids = list(dict.fromkeys(
            testset_sent_to_id[sent] for sent in review_sents if sent in testset_sent_to_id
        ))
        # Candidate set = union of the user's and the item's sentences.
        # It is turned into a sorted list because random.sample requires a
        # sequence (sets were deprecated in Python 3.9 and raise TypeError
        # from 3.11) and because set iteration order is not stable between
        # runs, which would defeat the seed. key=str keeps the order
        # well-defined whatever type the ids have.
        cur_useritem_sent_ids = sorted(cur_user_sent_ids | cur_item_sent_ids, key=str)
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = test_sampling_rng.sample(cur_useritem_sent_ids, sample_sent_num)
            cnt_being_cut_useritem += 1
        else:
            # Small enough: keep the whole user+item union.
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict; reviews with no known true sentence are dropped
        if len(cur_review_sent_ids) != 0:
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), cur_review_sent_ids]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    user_item_candidate_sent_ids_testset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 500 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print('During constructing, {} user-item pair are being cutted due to their length'.format(cnt_being_cut_useritem))

500 user processed.
1000 user processed.
1500 user processed.
2000 user processed.
2500 user processed.
Finish.
Totally 2963 users
Totally 40730 reviews. Among them 1 reviews has empty true label sentence
During constructing, 4079 user-item pair are being cutted due to their length


In [51]:
# Summary of the pre-sampling candidate-set sizes collected while building the dict.
candidate_set_sizes = user_item_candidate_sentence_num
print("Totally {} user item pairs in the testset".format(len(candidate_set_sizes)))
largest_candidate_set = np.max(candidate_set_sizes)
print("max number of candidate sentence: {}".format(largest_candidate_set))
smallest_candidate_set = np.min(candidate_set_sizes)
print("min number of candidate sentence: {}".format(smallest_candidate_set))

Totally 40729 user item pairs in the testset
max number of candidate sentence: 7010
min number of candidate sentence: 83


In [52]:
# 4000th-largest pre-sampling candidate-set size.
# NOTE(review): presumably the basis for the sample_sent_num cap used for the test split — confirm.
sorted(user_item_candidate_sentence_num)[-4000]

2611

In [53]:
# Number of users in the test-split user -> item -> sentence-ids dict.
len(user_item_candidate_sent_ids_testset)

2963

In [54]:
# Persist the nested user -> item -> [candidates, true sentences] dict as one JSON document.
test_useritem_path = '../Dataset/ratebeer/{}/test/useritem2sentids_test.json'.format(dataset_name)
with open(test_useritem_path, 'w') as f:
    json.dump(user_item_candidate_sent_ids_testset, f)

In [55]:
# Count the (user, item) entries of the test-split dict, unpacking each entry
# according to its stored layout: item id -> [candidate ids, true review ids].
# (Previously candidate_sents was bound to the item id and true_label_sents
# to the whole [candidates, true] pair.)
review_test_cnt = 0
for user_id, user_dict in user_item_candidate_sent_ids_testset.items():
    for item_id, (candidate_sents, true_label_sents) in user_dict.items():
        # Entries are only stored for reviews with at least one true sentence.
        assert len(true_label_sents) != 0
        review_test_cnt += 1
print(review_test_cnt)

40729


In [56]:
# Spot check one (user, item) entry of the test-split dict.
check_user_id = "1988"
check_item_id = "202"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
# Entry layout: index 0 = candidate sentence ids, index 1 = true review sentence ids.
test_entry = user_item_candidate_sent_ids_testset[check_user_id][check_item_id]
print("number of sentence in candidate set: {}".format(len(test_entry[0])))
print("number of sentence in true review set: {}".format(len(test_entry[1])))

user: 1988 	 item: 202
number of sentence in candidate set: 2248
number of sentence in true review set: 4


In [57]:
# Write useritem2sentids_test into a line-by-line format:
# one JSON object per (user, item) pair, each on its own line.
with open('../Dataset/ratebeer/{}/test/useritem2sentids_test_multilines.json'.format(dataset_name), 'w') as f1:
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    useritem_set = set()
    for user_key, items_of_user in user_item_candidate_sent_ids_testset.items():
        user_id_str = str(user_key)
        # Ids are written as integers in the per-line records.
        user_id = int(user_key)
        user_set.add(user_id_str)
        cnt_user += 1
        for item_key, sent_id_lists in items_of_user.items():
            item_id_str = str(item_key)
            item_id = int(item_key)
            # Stored layout: [candidate sentence ids, true review sentence ids].
            candidate_sent_ids = sent_id_lists[0]
            true_revw_sent_ids = sent_id_lists[1]
            cur_data_dict = {
                'user_id':user_id,
                'item_id':item_id,
                'candidate':candidate_sent_ids,
                "review":true_revw_sent_ids,
            }
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            useritem_set.add((user_id_str, item_id_str))
            cnt_review += 1

print("Total {} users".format(cnt_user))
print("Totat {} reviews".format(cnt_review))

Total 2963 users
Totat 40729 reviews


In [58]:
# Compare the same (user, item) entry across the valid and test dicts.
check_user_id = "1988"
check_item_id = "202"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
# Entry layout: index 0 = candidate sentence ids, index 1 = true review sentence ids.
valid_entry = user_item_candidate_sent_ids_validset[check_user_id][check_item_id]
print("[VALID] number of sentence in candidate set: {}".format(len(valid_entry[0])))
print("[VALID] number of sentence in true review set: {}".format(len(valid_entry[1])))
test_entry = user_item_candidate_sent_ids_testset[check_user_id][check_item_id]
print("[TEST]  number of sentence in candidate set: {}".format(len(test_entry[0])))
print("[TEST]  number of sentence in true review set: {}".format(len(test_entry[1])))

user: 1988 	 item: 202
[VALID] number of sentence in candidate set: 500
[VALID] number of sentence in true review set: 4
[TEST]  number of sentence in candidate set: 2248
[TEST]  number of sentence in true review set: 4


In [63]:
# Number of training sentence ids stored for the checked user.
len(trainset_user_to_sent_id[check_user_id])

263

In [64]:
# Number of training sentence ids stored for the checked item.
len(trainset_item_to_sent_id[check_item_id])

1985

In [65]:
# Size of the user+item union, i.e. the un-sampled candidate set for the checked pair.
len(set(trainset_user_to_sent_id[check_user_id]) | set(trainset_item_to_sent_id[check_item_id]))

2248