In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


In [2]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize

In [3]:
from spacy.lang.en import English
# Instantiate spaCy's English language class; only its tokenizer is taken from it below.
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
# NOTE(review): `tokenizer` is not referenced by any later cell visible in this section
# (sentence splitting below uses nltk's sent_tokenize) -- confirm whether it is still needed.
tokenizer = nlp.tokenizer

In [4]:
import string
# String of ASCII punctuation characters from the standard library.
# NOTE(review): `punct` is not referenced by any later cell visible in this section.
punct = string.punctuation

# Load Data

## Load Dataset

In [5]:
dir_path = '../Dataset/ratebeer/large_500'


def load_review_file(file_path, log_every):
    """Read a JSON-lines review file into a list of rows.

    Each non-blank line of the file must be a JSON object with the keys
    'user', 'item', 'rating' and 'review'.

    Args:
        file_path: path of the JSON-lines file to read.
        log_every: print a progress message every `log_every` loaded rows.

    Returns:
        A list of ``[item_id, user_id, rating, review]`` lists, in file order.

    Raises:
        KeyError: if a record lacks one of the four expected keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    rows = []
    # JSON text is UTF-8 by specification; do not depend on the locale default encoding.
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            line_data = json.loads(line)
            rows.append([
                line_data['item'],
                line_data['user'],
                line_data['rating'],
                line_data['review'],
            ])
            if len(rows) % log_every == 0:
                print('{} lines loaded.'.format(len(rows)))
    return rows


# Load train dataset
train_review = load_review_file(
    os.path.join(dir_path, 'train_review_filtered.json'), log_every=100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset
test_review = load_review_file(
    os.path.join(dir_path, 'test_review_filtered.json'), log_every=10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

100000 lines loaded.
200000 lines loaded.
300000 lines loaded.
Finish loading train dataset, totally 302573 lines.
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
Finish loading test dataset, totally 40730 lines.


In [6]:
# Both splits share the same row layout: [item, user, rating, review].
review_columns = ['item', 'user', 'rating', 'review']
df_train_data = pd.DataFrame(train_review, columns=review_columns)
df_test_data = pd.DataFrame(test_review, columns=review_columns)

In [7]:
# Preview the training frame; the rendered output is truncated to its first and last rows.
df_train_data

Unnamed: 0,item,user,rating,review
0,196,999,17,dark brown body with a light brown head . nutt...
1,1149,999,14,hazy orange / gold body is topped by a medium ...
2,236,999,16,12 oz bottle thanks to acknud . pours much dar...
3,634,999,15,clear and radiant mahogany body with a small b...
4,490,999,16,nice looker with tons of spiderweb lacing . bo...
...,...,...,...,...
302568,373,3849,9,"this shit is rejected coors light , i m certai..."
302569,1458,3849,10,"being a philly fan , i m supposed to hate anyt..."
302570,269,3849,10,"a good malt liquor , undeniably , but it was l..."
302571,859,3849,2,how this beer is actually better than regular ...


In [8]:
# Sanity check: every (user, item) pair should occur at most once in the train set.
groupby_user_item = df_train_data.groupby(['user', 'item'])
cnt = 0
# Iterating a GroupBy already yields (key, group_frame) pairs, so the group is used
# directly instead of being looked up a second time with get_group(key).
for _, cur_df_user_item in groupby_user_item:
    if len(cur_df_user_item) > 1:
        # only print the first few offending groups to keep the output short
        if cnt <= 10:
            print(cur_df_user_item)
        cnt += 1
print("{} data instance are the same".format(cnt))
# a count of 0 confirms that there are no duplicated (user, item) reviews

0 data instance are the same


## Load Sentence2ID and ID2Sentence Mapping

In [9]:
# Load the sentence-text -> sentence-id mapping built for the train split.
with open('../Dataset/ratebeer/large_500/train/sentence/sentence2id.json', 'r') as f:
    sent_to_id = json.load(f)

In [10]:
# Spot-check the value type for one known sentence.
type(sent_to_id['bottle at home .'])
# sentence ids are stored as str (not int)

str

In [11]:
# Load the reverse mapping: sentence-id (str) -> sentence text.
with open('../Dataset/ratebeer/large_500/train/sentence/id2sentence.json', 'r') as f:
    id_to_sent = json.load(f)

In [12]:
# Spot-check: look up the text stored for sentence id '0'.
id_to_sent['0']

'bottle at home .'

In [13]:
# The forward and reverse mappings must have the same number of entries.
assert len(sent_to_id) == len(id_to_sent)

In [14]:
# Number of sentences in the sentence -> id mapping.
len(sent_to_id)

1246458

## Load Feature Words

In [15]:
# Load the feature-word -> feature-id mapping for the train split.
feature_2_id_file = '../Dataset/ratebeer/large_500/train/feature/feature2id.json'
with open(feature_2_id_file, 'r') as f:
    feature_vocab = json.load(f)

In [16]:
# Number of feature words in the vocabulary.
len(feature_vocab)

2000

In [17]:
# Spot-check the id stored for the feature word 'aroma'.
feature_vocab['aroma']
# feature ids are stored as str (not int)

'1'

## Load Sentence2Feature

In [18]:
# Load the sentence-id -> features mapping for the train split.
with open('../Dataset/ratebeer/large_500/train/sentence/sentence2feature.json', 'r') as f:
    sentence_to_feature = json.load(f)

In [19]:
# Spot-check sentence '0': the entry is a dict of feature id (str) -> float value.
# NOTE(review): the floats are presumably per-feature weights (e.g. TF-IDF) -- confirm.
sentence_to_feature['0']

{'10': 0.4639189217945095, '486': 0.8858776631121363}

In [20]:
# Every sentence in the sentence -> id mapping must have an entry in sentence_to_feature.
assert len(sentence_to_feature) == len(sent_to_id)
# display the shared size
len(sentence_to_feature)

1246458

# Construct User-Item Pair

## GroupBy User

In [21]:
# Group the training reviews by user id (reused by later cells).
group_by_user = df_train_data.groupby('user')

In [22]:
# Materialize the groups as {user_id: DataFrame of that user's reviews} for direct lookup.
group_by_user_dict = {user_key: user_frame for user_key, user_frame in group_by_user}

In [23]:
# Inspect the training reviews written by user '1223'.
group_by_user_dict['1223']

Unnamed: 0,item,user,rating,review
55001,222,1223,11,"lo mismo que el aleman , esperaba mas bottle :..."
55002,923,1223,1,tipica cerveza latina que hay que tomarse rapi...
55003,1078,1223,12,"para ser un pale ale normal , bastante mejor q..."
55004,987,1223,1,this beer gives me so much gas that i had to m...
55005,1055,1223,15,"la birria parecida , salada y hoppy"
...,...,...,...,...
55089,849,1223,13,imperial stouts are the shiznit . every day i ...
55090,1148,1223,12,"en pike brewery con sici , parte de un sampler..."
55091,2275,1223,14,bastante hoppy
55092,494,1223,12,"solo es black < , no muy ale ni cigar . intrer..."


In [24]:
# Number of distinct users in the train set.
len(group_by_user_dict)

2963

In [25]:
# Distinct user ids in order of first appearance in the training frame.
user_id_list = list(df_train_data['user'].unique())

In [26]:
# Should equal len(group_by_user_dict).
print(len(user_id_list))

2963


In [27]:
# Build {user_id: set of sentence ids} from every review the user wrote.
user_to_sent = dict()
cnt_sentence_with_no_feature = 0

for user_id in user_id_list:
    # user ids are str keys into the per-user frames
    user_df = group_by_user_dict[user_id]
    user_reviews = list(user_df['review'])
    user_sent_ids = set()
    for review in user_reviews:
        # split the review text into sentences
        review_sents = sent_tokenize(review)
        for sent in review_sents:
            # sentences absent from the sentence2id mapping are skipped
            if sent not in sent_to_id:
                continue
            cur_sent_id = sent_to_id[sent]
            assert isinstance(cur_sent_id, str)
            # keep only sentences that have a feature entry; count the others
            if cur_sent_id not in sentence_to_feature:
                cnt_sentence_with_no_feature += 1
                continue
            user_sent_ids.add(cur_sent_id)
        # TODO: what should we do if there are sentences that appear in multiple reviews?
    user_to_sent[user_id] = user_sent_ids
print('{} sentence with no feature'.format(cnt_sentence_with_no_feature))

0 sentence with no feature


In [28]:
# One entry per user.
len(user_to_sent)

2963

In [29]:
# Convert each user's sentence-id set into a list so it can be serialized as JSON.
user_to_sentids = dict()
for user_id, user_sents in user_to_sent.items():
    assert len(user_sents) > 0
    assert isinstance(user_id, str)
    # validate every id, not just one arbitrary element of the set
    assert all(isinstance(sent_id, str) for sent_id in user_sents)
    # Iteration order of a set of str varies between interpreter runs (hash randomization),
    # so sort the ids to make the written file identical across runs.
    user_to_sentids[user_id] = sorted(user_sents)

In [30]:
# One entry per user.
len(user_to_sentids)

2963

In [31]:
# Persist the user -> sentence-id-list mapping (overwrites any existing file).
with open('../Dataset/ratebeer/large_500/train/user/user2sentids.json', 'w') as f:
    json.dump(user_to_sentids, f)

## GroupBy Item

In [32]:
# Group the training reviews by item id and materialize {item_id: DataFrame of its reviews}.
group_by_item = df_train_data.groupby('item')
group_by_item_dict = {item_key: item_frame for item_key, item_frame in group_by_item}

In [33]:
# Inspect the training reviews written about item '256'.
group_by_item_dict['256']

Unnamed: 0,item,user,rating,review
136,256,999,15,12 oz bottle pours a hazy copper body topped b...
1968,256,1892,14,"on tap @ the "" sarasota rhythm & brews festiva..."
2794,256,677,15,hazy copper pour with a creamy head . the nose...
4028,256,610,12,a decent flavored pale ale but not much body t...
4542,256,1381,15,deep gold color with hints of orange and a whi...
...,...,...,...,...
298708,256,2648,14,bottle . copper / amber pour with white head ....
298720,256,3356,13,"aroma : roasted grain , fruity - - pear , peac..."
300472,256,3380,10,"a little skunk aroma when opened , nice color ..."
301398,256,2605,15,a good pale ale . it has a nice light malt aro...


In [34]:
# Number of distinct items in the train set.
len(group_by_item_dict)

3744

In [35]:
# Build {item_id: set of sentence ids} from every review written about the item.
item_id_list = list(df_train_data['item'].unique())
item_to_sent = dict()
cnt_sentence_with_no_feature = 0
for item_id in item_id_list:
    # item ids are expected to be str keys into the per-item frames
    assert isinstance(item_id, str)
    item_df = group_by_item_dict[item_id]
    item_reviews = list(item_df['review'])
    item_sent_ids = set()
    for review in item_reviews:
        # split the review text into sentences
        review_sents = sent_tokenize(review)
        for sent in review_sents:
            # sentences absent from the sentence2id mapping are skipped
            if sent not in sent_to_id:
                continue
            cur_sent_id = sent_to_id[sent]
            assert isinstance(cur_sent_id, str)
            if cur_sent_id in sentence_to_feature:
                # the sentence has a feature entry: record it for this item
                item_sent_ids.add(cur_sent_id)
            else:
                cnt_sentence_with_no_feature += 1
        # TODO: what should we do if there are sentences that appear in multiple reviews?
    item_to_sent[item_id] = item_sent_ids

print('{} sentence with no feature'.format(cnt_sentence_with_no_feature))

0 sentence with no feature


In [36]:
# One entry per item.
len(item_to_sent)

3744

In [37]:
# Convert each item's sentence-id set into a list so it can be serialized as JSON.
item_to_sentids = dict()
for item_id, item_sents in item_to_sent.items():
    assert len(item_sents) > 0
    # validate every id, not just one arbitrary element of the set
    assert all(isinstance(sent_id, str) for sent_id in item_sents)
    # Iteration order of a set of str varies between interpreter runs (hash randomization),
    # so sort the ids to make the written file identical across runs.
    item_to_sentids[item_id] = sorted(item_sents)

In [38]:
# One entry per item.
len(item_to_sentids)

3744

In [39]:
# Persist the item -> sentence-id-list mapping (overwrites any existing file).
with open('../Dataset/ratebeer/large_500/train/item/item2sentids.json', 'w') as f:
    json.dump(item_to_sentids, f)

# For Each Data Instance in Trainset

In [40]:
import random

# Build, for every (user, item) training review, a pair
#   [candidate sentence ids, true sentence ids]
# where the candidates are drawn from the union of the user's and the item's sentences
# and always include the review's own (true) sentences.
# Result layout: user_item_candidate_sent_ids[user_id_str][item_id_str] = [candidates, true_ids]

# maximum number of sentences sampled from the user/item pool (before adding true sentences)
sample_sent_num = 500
# Dedicated seeded generator: sampling is reproducible across runs and the global
# `random` state is left untouched.
sample_seed = 42
sample_rng = random.Random(sample_seed)

user_item_candidate_sent_ids = dict()
# bookkeeping counters reported at the end of the cell
user_cnt = 0
review_cnt = 0
review_with_no_selectd_label_sentence = 0
useable_review_cnt = 0
sentence_with_no_feature_cnt = 0
sentence_not_tracked = set()

# Loop over all users; iterating the GroupBy yields (user key, frame of that user's reviews)
for user_key, user_df in group_by_user:
    user_id_str = str(user_key)
    # all sentence ids written by this user
    cur_user_sent_ids = user_to_sent[user_id_str]
    # item-level dict for this user
    item_candidate_sent_ids = dict()
    for item_key, review_text in zip(user_df['item'], user_df['review']):
        item_id_str = str(item_key)
        review_cnt += 1
        # all sentence ids written about this item
        cur_item_sent_ids = item_to_sent[item_id_str]
        # sentence ids of this particular review (the "true" labels)
        cur_review_sent_ids = set()
        for sent in sent_tokenize(review_text):
            if sent in sent_to_id:
                cur_sent_id = sent_to_id[sent]
                assert isinstance(cur_sent_id, str)
                # only sentences that have a feature entry are usable as labels
                if cur_sent_id in sentence_to_feature:
                    cur_review_sent_ids.add(cur_sent_id)
                else:
                    sentence_with_no_feature_cnt += 1
            else:
                # not covered by the sentence-id mapping: remember it so we can report
                # how many distinct sentences are being ignored
                sentence_not_tracked.add(sent)
        # reviews without any usable true sentence are counted and skipped
        if len(cur_review_sent_ids) == 0:
            review_with_no_selectd_label_sentence += 1
            continue
        # candidate pool: union of the user's sentences and the item's sentences
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        if len(cur_useritem_sent_ids) > sample_sent_num:
            # random.sample() needs a sequence: passing a set is deprecated since
            # Python 3.9 and raises TypeError from 3.11. Sorting also fixes the
            # population order so the seeded draw is reproducible.
            sample_useritem_sent_ids = set(
                sample_rng.sample(sorted(cur_useritem_sent_ids), sample_sent_num))
        else:
            # Pool is small enough: keep the whole user+item union.
            # (An earlier version mistakenly kept only cur_user_sent_ids here.)
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # the true sentences must always be among the candidates
        final_useritem_sent_ids = sample_useritem_sent_ids | cur_review_sent_ids
        # sorted lists keep the serialized output stable across runs
        item_candidate_sent_ids[item_id_str] = [
            sorted(final_useritem_sent_ids), sorted(cur_review_sent_ids)]
        useable_review_cnt += 1
    # add the item-level dict into the user-level dict
    user_item_candidate_sent_ids[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 200 == 0:
        print("{} user processed".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, review_with_no_selectd_label_sentence))
print("{} sentences has 0 feature".format(sentence_with_no_feature_cnt))
print("{} sentences are not being tracked in the sent2id mapping".format(len(sentence_not_tracked)))


200 user processed
400 user processed
600 user processed
800 user processed
1000 user processed
1200 user processed
1400 user processed
1600 user processed
1800 user processed
2000 user processed
2200 user processed
2400 user processed
2600 user processed
2800 user processed
Finish.
Totally 2963 users
Totally 302573 reviews. Among them 234 reviews has empty true label sentence
0 sentences has 0 feature
14774 sentences are not being tracked in the sent2id mapping


In [41]:
len(user_item_candidate_sent_ids)
# one entry per user: 2963 users (same as len(user_to_sent))

2963

In [42]:
# Recount the (user, item) entries actually stored in user_item_candidate_sent_ids.
# Entries are keyed by item id inside each user's dict, so a repeated (user, item) pair
# would have overwritten the earlier one; this count is therefore the number of unique
# stored reviews. Also collect the number of true sentences per stored review.

cnt_unique_reviews = 0
cnt_empty_true_sent = 0
sentence_per_review = []
# [user-level] one dict of item entries per user
for item_entries in user_item_candidate_sent_ids.values():
    # [item-level] each entry is the pair [candidate sentence ids, true sentence ids]
    for candidate_sent_ids, true_sent_ids in item_entries.values():
        if len(true_sent_ids) == 0:
            cnt_empty_true_sent += 1
        else:
            assert isinstance(candidate_sent_ids[0], str)
            assert isinstance(true_sent_ids[0], str)
            sentence_per_review.append(len(true_sent_ids))
        cnt_unique_reviews += 1

print("Total number of unique selected reviews: {}".format(cnt_unique_reviews))
print("Total number of review with empty true sentences: {}".format(cnt_empty_true_sent))
print("Total number of unique review with non-empty true sentences: {}".format(
    cnt_unique_reviews - cnt_empty_true_sent))

Total number of unique selected reviews: 302339
Total number of review with empty true sentences: 0
Total number of unique review with non-empty true sentences: 302339


In [43]:
import numpy as np
# Summary statistics of the number of true sentences per stored (user, item) review.
true_sent_counts = np.asarray(sentence_per_review)
num_pairs = len(sentence_per_review)
print("Totally {} user-item pairs in the trainset".format(num_pairs))
print("max number of true sentence per review: {}".format(true_sent_counts.max()))
print("min number of true sentence per review: {}".format(true_sent_counts.min()))
print("mean number of true sentence per review: {}".format(true_sent_counts.mean()))

Totally 302339 user-item pairs in the trainset
max number of true sentence per review: 48
min number of true sentence per review: 1
mean number of true sentence per review: 4.417336169002345


In [44]:
# Persist the nested user -> item -> [candidate ids, true ids] mapping (overwrites any existing file).
with open('../Dataset/ratebeer/large_500/train/useritem2sentids.json', 'w') as f:
    json.dump(user_item_candidate_sent_ids, f)

In [45]:
# Spot-check user '1892' / item '256': number of candidate sentence ids (index 0 of the pair).
len(user_item_candidate_sent_ids['1892']['256'][0])

503

In [46]:
# Number of sentence ids collected for user '1892'.
len(user_to_sent['1892'])

308

In [47]:
# Number of sentence ids collected for item '256'.
len(item_to_sent['256'])

1510

In [48]:
# Size of the user+item candidate pool before sampling (both operands are already sets).
len(user_to_sent['1892'] | item_to_sent['256'])

1813

In [49]:
# Spot-check user '1892' / item '256': number of true sentence ids (index 1 of the pair).
len(user_item_candidate_sent_ids['1892']['256'][1])

5