In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


In [2]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize

In [3]:
from spacy.lang.en import English
# Blank English pipeline; only its tokenizer is used in this notebook.
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer

In [4]:
import string
# All ASCII punctuation characters as a single str; used later to skip punctuation tokens.
punct = string.punctuation

In [5]:
# Sanity check: tokenize a sample sentence and confirm that a token's `.text` is a plain str.
tokens = tokenizer("This is a sentence")
print(type(tokens[0].text))

<class 'str'>


# Read Data

In [6]:
dir_path = '../Dataset/ratebeer/large_500'


def _load_reviews(file_path, log_every):
    """Read a JSON-lines review file into a list of rows.

    :param file_path: path of a text file holding one JSON object per line; every
        object must provide the keys 'user', 'item', 'rating' and 'review'
    :param log_every: print a progress message each time this many rows were loaded
    :return: list of [item_id, user_id, rating, review] lists, in file order
    """
    rows = []
    # JSON interchange text is UTF-8; do not depend on the platform default encoding.
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            # Skip blank lines instead of letting json.loads fail on them.
            if not line.strip():
                continue
            line_data = json.loads(line)
            rows.append([line_data['item'], line_data['user'],
                         line_data['rating'], line_data['review']])
            if len(rows) % log_every == 0:
                print('{} lines loaded.'.format(len(rows)))
    return rows


# Load train dataset
train_review = _load_reviews(os.path.join(dir_path, 'train_review_filtered.json'), 100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset
test_review = _load_reviews(os.path.join(dir_path, 'test_review_filtered.json'), 10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

100000 lines loaded.
200000 lines loaded.
300000 lines loaded.
Finish loading train dataset, totally 302573 lines.
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
Finish loading test dataset, totally 40730 lines.


## Convert List Data to Pandas Dataframe

In [7]:
# Train and test rows share one column layout: the order used when the rows were built.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data, df_test_data = (
    pd.DataFrame(rows, columns=review_columns)
    for rows in (train_review, test_review)
)

In [8]:
df_train_data

Unnamed: 0,item,user,rating,review
0,196,999,17,dark brown body with a light brown head . nutt...
1,1149,999,14,hazy orange / gold body is topped by a medium ...
2,236,999,16,12 oz bottle thanks to acknud . pours much dar...
3,634,999,15,clear and radiant mahogany body with a small b...
4,490,999,16,nice looker with tons of spiderweb lacing . bo...
...,...,...,...,...
302568,373,3849,9,"this shit is rejected coors light , i m certai..."
302569,1458,3849,10,"being a philly fan , i m supposed to hate anyt..."
302570,269,3849,10,"a good malt liquor , undeniably , but it was l..."
302571,859,3849,2,how this beer is actually better than regular ...


In [9]:
len(df_train_data['user'].unique())

2963

In [10]:
len(df_train_data['item'].unique())

3744

# Compute Sentence TF-IDF

In [11]:
def catDoc(textlist):
    """Flatten one level of nesting.

    :param textlist: iterable of iterables
    :return: new list with the elements of every inner iterable, in their original order
    """
    return [element for tlist in textlist for element in tlist]

In [12]:
def get_tfidf_embedding(text, feature_word_list):
    """
    Compute the TF-IDF weight of every feature word for every document in `text`.

    :param text: documents (e.g. one sentence each) passed to CountVectorizer.fit_transform
    :param feature_word_list: fixed vocabulary handed to CountVectorizer
    :return:
        vectorizer: the fitted CountVectorizer
            vocabulary_: word2id
        tfidf_weight: dense array, one row per document and one column per feature word
    """
    count_vectorizer = CountVectorizer(vocabulary=feature_word_list, lowercase=True)
    counts = count_vectorizer.fit_transform(text)
    # The IDF statistics are fitted on these same documents; the sparse result is then densified.
    dense_tfidf = TfidfTransformer().fit_transform(counts).toarray()
    return count_vectorizer, dense_tfidf

In [13]:
def get_tf_score(text, feature_word_list):
    """Count the occurrences of every feature word in every document of `text`.

    :param text: documents passed to CountVectorizer.fit_transform
    :param feature_word_list: fixed vocabulary handed to CountVectorizer
    :return: dense count array, one row per document and one column per feature word
    """
    counter = CountVectorizer(vocabulary=feature_word_list, lowercase=True)
    return counter.fit_transform(text).toarray()

In [14]:
def compress_array(a, id2word, vocab):
    """
    Convert a dense document-by-word matrix into nested dicts of its non-zero entries.

    :param a: matrix, [N, M], N is document number, M is word number
    :param id2word: maps a column index of `a` to its word
    :param vocab: maps a word to the id used as key in the result
    :return: dict {row index: {vocab[id2word[column index]]: value}}; only non-zero
        values are kept, so a row without any yields an empty inner dict
    """
    return {
        doc_idx: {
            vocab[id2word[word_idx]]: weight
            for word_idx, weight in enumerate(doc_row)
            if weight != 0
        }
        for doc_idx, doc_row in enumerate(a)
    }

# Load Feature Words

In [15]:
feature_2_id_file = '../Dataset/ratebeer/large_500/train/feature/feature2id.json'
# feature2id.json maps every feature word to its id (the ids are stored as strings).
with open(feature_2_id_file, 'r') as vocab_file:
    feature_vocab = json.load(vocab_file)

In [16]:
len(feature_vocab)

2000

In [17]:
feature_vocab['aroma']

'1'

In [18]:
# The position of a word in this list becomes its column index in CountVectorizer,
# so the list relies on the key order of feature_vocab.
feature_word_list = list(feature_vocab)
print('Number of feature words: {}'.format(len(feature_word_list)))

Number of feature words: 2000


# Check Whether Any Review Has No Sentences

In [19]:
invalid_data = 0
for row_idx, row in df_train_data.iterrows():
    # An empty list from sent_tokenize means the review contains no sentence at all.
    if not sent_tokenize(row['review']):
        print(row)
        invalid_data += 1

In [20]:
print(invalid_data)

0


# Construct Sentence Vocab

In [21]:
# sentence vocab
# sentence vocab: sentence text -> number of occurrences in the train reviews
sentence_count = dict()
sentence_with_no_feature = 0
# Loop for each review
for idx, row in df_train_data.iterrows():
    review_sents = sent_tokenize(row['review'])
    # raw feature-word counts, one row per sentence of this review
    tf_score = get_tf_score(review_sents, feature_word_list)
    feature_hits_per_sent = np.sum(tf_score, axis=1)
    for cur_sent, feature_hits in zip(review_sents, feature_hits_per_sent):
        # a sentence without any feature word is only counted, never stored
        if feature_hits == 0.0:
            sentence_with_no_feature += 1
            continue
        # Count the effective tokens, i.e. tokens that are neither digits nor punctuation.
        # NOTE(review): `token.text in punct` is a substring test against string.punctuation,
        # so a multi-character token such as '...' is NOT skipped - confirm this is intended.
        effective_tokens = sum(
            1 for token in tokenizer(cur_sent)
            if not (token.text.isdigit() or (token.text in punct))
        )
        # only a sentence with at least 3 effective tokens is added to the sentence vocab
        if effective_tokens >= 3:
            sentence_count[cur_sent] = sentence_count.get(cur_sent, 0) + 1
    # progress log; assumes the frame keeps its default 0..N-1 index
    if (idx+1) % 10000 == 0:
        print("Processed {} lines".format(idx+1))
print('Finish.')

Processed 10000 lines
Processed 20000 lines
Processed 30000 lines
Processed 40000 lines
Processed 50000 lines
Processed 60000 lines
Processed 70000 lines
Processed 80000 lines
Processed 90000 lines
Processed 100000 lines
Processed 110000 lines
Processed 120000 lines
Processed 130000 lines
Processed 140000 lines
Processed 150000 lines
Processed 160000 lines
Processed 170000 lines
Processed 180000 lines
Processed 190000 lines
Processed 200000 lines
Processed 210000 lines
Processed 220000 lines
Processed 230000 lines
Processed 240000 lines
Processed 250000 lines
Processed 260000 lines
Processed 270000 lines
Processed 280000 lines
Processed 290000 lines
Processed 300000 lines
Finish.


In [22]:
len(sentence_count)

1246458

In [23]:
sentence_with_no_feature

3696

In [24]:
# sort sentence based on counts (the majority should be 1)
# Most frequent sentences first; the sort is stable, so ties keep their insertion order.
sorted_sent_counts = sorted(sentence_count.items(), key=lambda pair: pair[1], reverse=True)

In [25]:
# sentence_vocab_list = list(sentence_count.keys())
# Building mappings from sentences to ids and ids to sentences
# A sentence's id is its rank in sorted_sent_counts, stored as a string.
sent_to_id = {sent: str(rank) for rank, (sent, _) in enumerate(sorted_sent_counts)}
# Since we loaded all the tokenized sentences, we don't need to add the special UNK token
id_to_sent = {sent_id: sent for sent, sent_id in sent_to_id.items()}

In [26]:
len(sent_to_id)

1246458

In [27]:
len(id_to_sent)

1246458

In [28]:
id_to_sent['42']

'nice bitter finish .'

## Save Sentence to ID into Json file

In [29]:
# Persist the id -> sentence mapping.
with open('../Dataset/ratebeer/large_500/train/sentence/id2sentence.json', 'w') as out_file:
    json.dump(id_to_sent, out_file)

In [30]:
# Persist the sentence -> id mapping.
with open('../Dataset/ratebeer/large_500/train/sentence/sentence2id.json', 'w') as out_file:
    json.dump(sent_to_id, out_file)

# Get Sentence Feature

In [31]:
df_train_data

Unnamed: 0,item,user,rating,review
0,196,999,17,dark brown body with a light brown head . nutt...
1,1149,999,14,hazy orange / gold body is topped by a medium ...
2,236,999,16,12 oz bottle thanks to acknud . pours much dar...
3,634,999,15,clear and radiant mahogany body with a small b...
4,490,999,16,nice looker with tons of spiderweb lacing . bo...
...,...,...,...,...
302568,373,3849,9,"this shit is rejected coors light , i m certai..."
302569,1458,3849,10,"being a philly fan , i m supposed to hate anyt..."
302570,269,3849,10,"a good malt liquor , undeniably , but it was l..."
302571,859,3849,2,how this beer is actually better than regular ...


In [32]:
def check_vocab_is_same(sklearn_vocab, feature_vocab):
    """Tell whether two word -> id vocabularies hold the same words with the same ids.

    :param sklearn_vocab: dict word -> int id (e.g. CountVectorizer.vocabulary_)
    :param feature_vocab: dict word -> id, where the id may be an int or a numeric string
    :return: True when both have the same size and every word of `sklearn_vocab` is
        present in `feature_vocab` with an equal integer id, otherwise False
    """
    if len(sklearn_vocab) != len(feature_vocab):
        return False
    for word, sklearn_vocab_id in sklearn_vocab.items():
        # A word missing from feature_vocab is a mismatch, not an error.
        if word not in feature_vocab:
            return False
        try:
            feature_vocab_id = int(feature_vocab[word])
        except (TypeError, ValueError):
            # An id that cannot be read as an integer can never match the sklearn id.
            return False
        if feature_vocab_id != sklearn_vocab_id:
            return False
    return True

In [33]:
sent_to_id['sampled from bottle .']

'80'

In [34]:
len(sent_to_id)

1246458

In [35]:
# sent_to_id was filled in id order, so position i holds the sentence whose id is str(i).
sentence_text_list = list(sent_to_id)

In [36]:
sentence_text_list[:10]

['bottle at home .',
 'easy to drink .',
 'small white head .',
 'very good beer .',
 'very nice beer .',
 'alcohol is well hidden .',
 'on tap at the brewery .',
 'lots of carbonation .',
 'poured from bottle .',
 'taste is the same as aroma .']

In [37]:
# One TF-IDF row per sentence in sentence_text_list.
# NOTE(review): get_tfidf_embedding returns a dense array; in the recorded run its shape is
# (1246458, 2000), i.e. roughly 20 GB if the values are float64 - confirm the host has that much RAM.
cntvector, tfidf_weight = get_tfidf_embedding(sentence_text_list, feature_word_list)

In [38]:
tfidf_weight.shape

(1246458, 2000)

In [39]:
check_vocab_is_same(cntvector.vocabulary_, feature_vocab)

True

In [40]:
# sentence id (str) -> {feature id (str): tf-idf weight} for every sentence with a feature
sentence_to_feature = dict()
sentence_with_no_feature = 0
for i, cur_sent in enumerate(sentence_text_list):
    # every sentence must be in the sent_to_id vocabulary
    assert cur_sent in sent_to_id
    # get the sentence_id (str); row i of tfidf_weight must belong to the sentence with id i
    cur_sent_id = sent_to_id[cur_sent]
    assert int(cur_sent_id) == i
    # Keep only the features with a non-zero tf-idf weight. np.flatnonzero locates them
    # in one vectorised pass instead of testing every column of the row in Python.
    sent_weights = tfidf_weight[i]
    feature_dict = {str(j): sent_weights[j] for j in np.flatnonzero(sent_weights)}
    if len(feature_dict) > 0:
        sentence_to_feature[cur_sent_id] = feature_dict
    else:
        sentence_with_no_feature += 1
    if (i+1) % 50000 == 0:
        print("Processed {} lines".format(i+1))
# Report the list length rather than the loop variable, which is undefined for an empty list.
print("Finish. Totally {} lines".format(len(sentence_text_list)))
print("Totally {} sentences has at least 1 feature and {} sentences don't have feature.".format(
    len(sentence_to_feature), sentence_with_no_feature))

Processed 50000 lines
Processed 100000 lines
Processed 150000 lines
Processed 200000 lines
Processed 250000 lines
Processed 300000 lines
Processed 350000 lines
Processed 400000 lines
Processed 450000 lines
Processed 500000 lines
Processed 550000 lines
Processed 600000 lines
Processed 650000 lines
Processed 700000 lines
Processed 750000 lines
Processed 800000 lines
Processed 850000 lines
Processed 900000 lines
Processed 950000 lines
Processed 1000000 lines
Processed 1050000 lines
Processed 1100000 lines
Processed 1150000 lines
Processed 1200000 lines
Finish. Totally 1246458 lines
Totally 1246458 sentences has at least 1 feature and 0 sentences don't have feature.


In [None]:
# # sentence to feature
# sentence_to_feature = dict()
# sentence_with_no_feature = 0
# # Loop for each review
# for idx, row in df_train_data.iterrows():
#     review_text = row['review']
#     review_sents = sent_tokenize(review_text)
#     cntvector, tfidf_weight = get_tfidf_embedding(review_sents, feature_word_list)
#     # assert cntvector.vocabulary_ == feature_vocab
#     assert check_vocab_is_same(cntvector.vocabulary_, feature_vocab)
#     # print(tfidf_weight)
#     tfidf_sum_sents = np.sum(tfidf_weight, axis=1)
#     for i in range(len(review_sents)):
#         cur_sent = review_sents[i]
#         # if this sentence is in the sent_to_id vocabulary
#         if cur_sent in sent_to_id:
#             # get the sentence_id
#             cur_sent_id = sent_to_id[cur_sent]
#             # find all the feature that has no-zero tf-idf weight
#             feature_dict = dict()
#             for j in range(len(tfidf_weight[i])):
#                 if tfidf_weight[i][j] != 0.0:
#                     # get the feature
#                     feature_id = j
#                     feature = feature_word_list[j]
#                     feature_tfidf = tfidf_weight[i][j]
#                     feature_dict[feature_id] = feature_tfidf
#             if len(feature_dict) > 0:
#                 sentence_to_feature[cur_sent_id] = feature_dict
#             else:
#                 sentence_with_no_feature += 1

#     if (idx+1) % 10000 == 0:
#         print("Processed {} lines".format(idx+1))


# Save Sentence2Feature into Json File

In [41]:
# Persist the sentence id -> {feature id: tf-idf weight} mapping.
with open('../Dataset/ratebeer/large_500/train/sentence/sentence2feature.json', 'w') as out_file:
    json.dump(sentence_to_feature, out_file)

In [42]:
# Read the file back so the following cells inspect exactly what was written to disk.
with open('../Dataset/ratebeer/large_500/train/sentence/sentence2feature.json', 'r') as in_file:
    sentence_to_feature = json.load(in_file)

In [43]:
sentence_to_feature['0']

{'10': 0.4639189217945095, '486': 0.8858776631121363}

In [46]:
sentence_to_feature['1246457']

{'195': 0.6551843674842948, '315': 0.7554690229282766}

In [47]:
len(sentence_to_feature)

1246458

In [48]:
# Number of features stored for each sentence.
num_feature_per_sentence = []
for feature_weights in sentence_to_feature.values():
    num_feature_per_sentence.append(len(feature_weights))
    # every stored sentence must carry at least one feature
    assert len(feature_weights) > 0

In [49]:
print("Mean number of features per sentence: {}".format(np.mean(num_feature_per_sentence)))
print("Max number of features per sentence: {}".format(np.max(num_feature_per_sentence)))
print("Min number of features per sentence: {}".format(np.min(num_feature_per_sentence)))

Mean number of features per sentence: 5.817189989554401
Max number of features per sentence: 44
Min number of features per sentence: 1


# Get User to Feature

In [50]:
df_train_data

Unnamed: 0,item,user,rating,review
0,196,999,17,dark brown body with a light brown head . nutt...
1,1149,999,14,hazy orange / gold body is topped by a medium ...
2,236,999,16,12 oz bottle thanks to acknud . pours much dar...
3,634,999,15,clear and radiant mahogany body with a small b...
4,490,999,16,nice looker with tons of spiderweb lacing . bo...
...,...,...,...,...
302568,373,3849,9,"this shit is rejected coors light , i m certai..."
302569,1458,3849,10,"being a philly fan , i m supposed to hate anyt..."
302570,269,3849,10,"a good malt liquor , undeniably , but it was l..."
302571,859,3849,2,how this beer is actually better than regular ...


## GroupBy User

In [51]:
user_id_list = []
user_reviews = []
# One entry per user: the int user id and all of that user's reviews joined into one document.
group_by_user = df_train_data.groupby('user')
for user_key, user_df in group_by_user:
    user_id_list.append(int(user_key))
    user_reviews.append(" ".join(list(user_df['review'])))


In [52]:
len(user_id_list)

2963

In [53]:
len(user_reviews)

2963

In [54]:
len(feature_word_list)

2000

## Compute User TF-IDF

In [55]:
# One TF-IDF row per user document; row i corresponds to user_id_list[i].
cntvector_user, tfidf_weight_user = get_tfidf_embedding(user_reviews, feature_word_list)

In [56]:
# assert cntvector_user.vocabulary_ == feature_vocab
check_vocab_is_same(cntvector_user.vocabulary_, feature_vocab)

True

In [57]:
tfidf_weight_user.shape

(2963, 2000)

In [58]:
user_to_feature = dict()
for i, cur_user_id in enumerate(user_id_list):
    user_weights = tfidf_weight_user[i]
    # Keep only the features with a non-zero tf-idf weight; np.flatnonzero yields their
    # column indices without testing every column in Python.
    # NOTE: make sure that the feature_id is str format
    feature_dict = {str(j): user_weights[j] for j in np.flatnonzero(user_weights)}
    # every user is expected to mention at least one feature word
    assert len(feature_dict) > 0
    user_to_feature[str(cur_user_id)] = feature_dict
    if (i+1) % 500 == 0:
        print("{} user processed.".format(i+1))
# Report the list length rather than the loop variable, which is undefined for an empty list.
print("Totally {} users".format(len(user_id_list)))

500 user processed.
1000 user processed.
1500 user processed.
2000 user processed.
2500 user processed.
Totally 2963 users


In [59]:
len(user_to_feature)

2963

In [60]:
# Number of features stored for each user.
num_feature_per_user = []
for feature_weights in user_to_feature.values():
    num_feature_per_user.append(len(feature_weights))
    # every user must carry at least one feature
    assert len(feature_weights) > 0

In [61]:
print("Mean number of features per user: {}".format(np.mean(num_feature_per_user)))
print("Max number of features per user: {}".format(np.max(num_feature_per_user)))
print("Min number of features per user: {}".format(np.min(num_feature_per_user)))

Mean number of features per user: 436.17448531893353
Max number of features per user: 1481
Min number of features per user: 12


## Save User to Feature Mapping into Json File

In [62]:
# Persist the user id -> {feature id: tf-idf weight} mapping.
with open('../Dataset/ratebeer/large_500/train/user/user2feature.json', 'w') as out_file:
    json.dump(user_to_feature, out_file)

In [63]:
# Read the file back to make sure what was written can be loaded again.
with open('../Dataset/ratebeer/large_500/train/user/user2feature.json', 'r') as in_file:
    user_to_feature = json.load(in_file)

# Get Item to Feature

## GroupBy Item

In [64]:
item_id_list = []
item_reviews = []
# One entry per item: the str item id and all reviews of that item joined into one document.
group_by_item = df_train_data.groupby('item')
for item_key, item_df in group_by_item:
    item_id_list.append(str(item_key))
    item_reviews.append(" ".join(list(item_df['review'])))


In [65]:
len(item_id_list)

3744

In [66]:
len(item_reviews)

3744

In [67]:
len(df_train_data['item'].unique())

3744

## Compute Item TF-IDF

In [68]:
# One TF-IDF row per item document; row i corresponds to item_id_list[i].
cntvector_item, tfidf_weight_item = get_tfidf_embedding(item_reviews, feature_word_list)

In [69]:
# assert cntvector_item.vocabulary_ == feature_vocab
check_vocab_is_same(cntvector_item.vocabulary_, feature_vocab)

True

In [70]:
tfidf_weight_item.shape

(3744, 2000)

In [71]:
# item id (str) -> {feature id (str): tf-idf weight}
item_to_feature = dict()
for i, cur_item_id in enumerate(item_id_list):
    item_weights = tfidf_weight_item[i]
    feature_dict = dict()
    # np.flatnonzero yields only the columns with a non-zero tf-idf weight, so the
    # Python loop no longer visits every column of the row.
    for j in np.flatnonzero(item_weights):
        feature_id = str(j)
        # column j must be the feature whose id in feature_vocab is j
        assert feature_id == feature_vocab[feature_word_list[j]]
        feature_dict[feature_id] = item_weights[j]
    # every item is expected to have at least one feature word in its reviews
    assert len(feature_dict) > 0
    item_to_feature[cur_item_id] = feature_dict
    if (i+1) % 500 == 0:
        print("{} items processed.".format(i+1))
# Report the list length rather than the loop variable, which is undefined for an empty list.
print('Finish. Totally {} items'.format(len(item_id_list)))

500 items processed.
1000 items processed.
1500 items processed.
2000 items processed.
2500 items processed.
3000 items processed.
3500 items processed.
Finish. Totally 3744 items


In [72]:
len(item_to_feature)

3744

In [73]:
# Number of features stored for each item.
num_feature_per_item = []
for feature_weights in item_to_feature.values():
    num_feature_per_item.append(len(feature_weights))
    # every item must carry at least one feature
    assert len(feature_weights) > 0

In [74]:
print("Mean number of features per item: {}".format(np.mean(num_feature_per_item)))
print("Max number of features per item: {}".format(np.max(num_feature_per_item)))
print("Min number of features per item: {}".format(np.min(num_feature_per_item)))

Mean number of features per item: 461.50267094017096
Max number of features per item: 1391
Min number of features per item: 100


## Save Item to Feature Mapping into Json File

In [75]:
# Persist the item id -> {feature id: tf-idf weight} mapping.
with open('../Dataset/ratebeer/large_500/train/item/item2feature.json', 'w') as out_file:
    json.dump(item_to_feature, out_file)