In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


# File Description
This notebook contains code for examining the data in different domains. It covers:
* Converting the previous __int__ ids to __str__ ids, so that all ids in every file are now in __str__ format
* Checking whether the sentence ids in the training/testing useritem_candidate_label data lie in the corresponding sentence_to_id (or id_to_sentence) mapping

In [28]:
import json
import pandas as pd
import os
import numpy as np

In [4]:
# Dataset variant to inspect; it is interpolated into the '../Dataset/ratebeer/{}' paths built below.
dataset_name = 'medium_30'

# Train Data

In [5]:
def load_reviews(file_path, progress_every):
    """Load a JSON-lines review file into a list of rows.

    Each line of the file is parsed as a JSON object that must provide the
    keys 'user', 'item', 'rating' and 'review'.

    Args:
        file_path: Path of the JSON-lines file to read.
        progress_every: Print a progress message each time this many lines
            have been read.

    Returns:
        A list of ``[item_id, user_id, rating, review]`` lists, one per line,
        in file order.

    Raises:
        KeyError: If a line lacks one of the four expected keys.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    reviews = []
    with open(file_path) as f:
        for cnt, line in enumerate(f, start=1):
            line_data = json.loads(line)
            # Row layout is item first, then user, rating and review text.
            reviews.append([line_data['item'], line_data['user'], line_data['rating'], line_data['review']])
            if cnt % progress_every == 0:
                print('{} lines loaded.'.format(cnt))
    return reviews


dir_path = '../Dataset/ratebeer/{}'.format(dataset_name)
# Load train dataset
train_review = load_reviews(os.path.join(dir_path, 'train_review_filtered.json'), progress_every=100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset (progress is reported every 10000 lines instead of every 100000)
test_review = load_reviews(os.path.join(dir_path, 'test_review_filtered.json'), progress_every=10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

100000 lines loaded.
Finish loading train dataset, totally 117818 lines.
10000 lines loaded.
Finish loading test dataset, totally 14677 lines.


In [6]:
# Both splits share the same row layout, so the column names are declared once.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data = pd.DataFrame(train_review, columns=review_columns)
df_test_data = pd.DataFrame(test_review, columns=review_columns)

In [7]:
# Distinct user / item ids that occur in the training reviews, kept as sets
# for fast membership checks.
unique_train_users = df_train_data['user'].unique()
unique_train_items = df_train_data['item'].unique()
train_user_id_set = set(unique_train_users)
train_item_id_set = set(unique_train_items)

# Sentence2id & id2Sentence

## Train Set

In [9]:
# Load both directions of the train-split sentence <-> id mapping.
train_sent2id_path = '../Dataset/ratebeer/{}/train/sentence/sentence2id.json'.format(dataset_name)
train_id2sent_path = '../Dataset/ratebeer/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(train_sent2id_path, 'r') as sent2id_file:
    trainset_sent_to_id = json.load(sent2id_file)
with open(train_id2sent_path, 'r') as id2sent_file:
    trainset_id_to_sent = json.load(id2sent_file)

In [10]:
# Spot-check the Python type of the id stored for one known training sentence.
type(trainset_sent_to_id['bottle at home .'])

str

In [11]:
# Look up the sentence stored under the string key '0' in the train mapping.
trainset_id_to_sent['0']

'easy to drink .'

In [12]:
# Number of entries in the train id -> sentence mapping.
len(trainset_id_to_sent)

489352

## Test Set

In [13]:
# Load both directions of the test-split sentence <-> id mapping.
test_sent2id_path = '../Dataset/ratebeer/{}/test/sentence/sentence2id.json'.format(dataset_name)
test_id2sent_path = '../Dataset/ratebeer/{}/test/sentence/id2sentence.json'.format(dataset_name)
with open(test_sent2id_path, 'r') as sent2id_file:
    testset_sent_to_id = json.load(sent2id_file)
with open(test_id2sent_path, 'r') as id2sent_file:
    testset_id_to_sent = json.load(id2sent_file)

In [14]:
# Look up the sentence stored under the string key '0' in the test mapping.
testset_id_to_sent['0']

'bottle .'

In [15]:
# Spot-check the Python type of the id stored for one known test sentence.
type(testset_sent_to_id['bottle .'])

str

In [16]:
# Number of entries in the test id -> sentence mapping.
len(testset_id_to_sent)

64334

## Valid Set

In [17]:
# Load both directions of the valid-split sentence <-> id mapping.
valid_sent2id_path = '../Dataset/ratebeer/{}/valid/sentence/sentence2id.json'.format(dataset_name)
valid_id2sent_path = '../Dataset/ratebeer/{}/valid/sentence/id2sentence.json'.format(dataset_name)
with open(valid_sent2id_path, 'r') as sent2id_file:
    validset_sent_to_id = json.load(sent2id_file)
with open(valid_id2sent_path, 'r') as id2sent_file:
    validset_id_to_sent = json.load(id2sent_file)

In [18]:
# The valid-split mappings are expected to be identical to the test-split
# mappings, in both directions (id -> sentence first, then sentence -> id).
mapping_pairs = (
    (validset_id_to_sent, testset_id_to_sent),
    (validset_sent_to_id, testset_sent_to_id),
)
for valid_mapping, test_mapping in mapping_pairs:
    assert valid_mapping == test_mapping

# Feature

In [19]:
# Load both directions of the feature <-> id mapping (stored under the train split).
feature2id_path = '../Dataset/ratebeer/{}/train/feature/feature2id.json'.format(dataset_name)
id2feature_path = '../Dataset/ratebeer/{}/train/feature/id2feature.json'.format(dataset_name)
with open(feature2id_path, 'r') as feature2id_file:
    feature_to_id = json.load(feature2id_file)
with open(id2feature_path, 'r') as id2feature_file:
    id_to_feature = json.load(id2feature_file)

In [20]:
# Spot-check the Python type of the id stored for the feature word 'aroma'.
type(feature_to_id['aroma'])

str

In [22]:
# The two directions of the feature mapping must have the same number of entries.
num_features = len(feature_to_id)
assert num_features == len(id_to_feature)
print("number of feature on dataset {0}: {1}".format(dataset_name, num_features))

number of feature on dataset medium_30: 1000


# User2Feature / Item2Feature / Sentence2Feature

In [23]:
# user2feature
user2feature_path = '../Dataset/ratebeer/{}/train/user/user2feature.json'.format(dataset_name)
with open(user2feature_path, 'r') as user2feature_file:
    user_to_feature = json.load(user2feature_file)

In [26]:
# item2feature
item2feature_path = '../Dataset/ratebeer/{}/train/item/item2feature.json'.format(dataset_name)
with open(item2feature_path, 'r') as item2feature_file:
    item_to_feature = json.load(item2feature_file)

In [27]:
# sentence2feature
sentence2feature_path = '../Dataset/ratebeer/{}/train/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_path, 'r') as sentence2feature_file:
    sentence_to_feature = json.load(sentence2feature_file)

In [29]:
# For every user / item / sentence, record how many features are attached to it.
user_to_feature_cnt = [len(features) for features in user_to_feature.values()]
item_to_feature_cnt = [len(features) for features in item_to_feature.values()]
sentence_to_feature_cnt = [len(features) for features in sentence_to_feature.values()]

# Entity counts: each list holds one entry per key of the corresponding mapping.
print("On dataset: {}".format(dataset_name))
print("number of user on train set: {}".format(len(user_to_feature_cnt)))
print("number of item on train set: {}".format(len(item_to_feature_cnt)))
print("number of sentence on train set: {}".format(len(sentence_to_feature_cnt)))
print("=========================================")
# Per-user feature-count distribution
print("max number of feature per user: {}".format(np.max(user_to_feature_cnt)))
print("min number of feature per user: {}".format(np.min(user_to_feature_cnt)))
print("mean number of feature per user: {}".format(np.mean(user_to_feature_cnt)))
print("=========================================")
# Per-item feature-count distribution
print("max number of feature per item: {}".format(np.max(item_to_feature_cnt)))
print("min number of feature per item: {}".format(np.min(item_to_feature_cnt)))
print("mean number of feature per item: {}".format(np.mean(item_to_feature_cnt)))
print("=========================================")
# Per-sentence feature-count distribution
print("max number of feature per sentence: {}".format(np.max(sentence_to_feature_cnt)))
print("min number of feature per sentence: {}".format(np.min(sentence_to_feature_cnt)))
print("mean number of feature per sentence: {}".format(np.mean(sentence_to_feature_cnt)))

On dataset: medium_30
number of user on train set: 1664
number of item on train set: 1490
number of sentence on train set: 489352
max number of feature per user: 849
min number of feature per user: 14
mean number of feature per user: 325.80528846153845
max number of feature per item: 760
min number of feature per item: 157
mean number of feature per item: 398.403355704698
max number of feature per sentence: 40
min number of feature per sentence: 1
mean number of feature per sentence: 5.371963331099086


# UserItem2Sentids

### Issue with the training set
When selecting training sentences, we omitted sentences with fewer than 3 effective tokens. As a result, all sentences of some reviews may be omitted, which leads to an __empty__ true-label sentence id list for those reviews.

This issue is now __FIXED__. Entries with an empty true-label sentence id list have been removed from the training set (i.e. *useritem_to_sentids*).

In [25]:
# NOTE(review): `useritem_to_sentids` is not assigned in any cell visible above
# this one; it presumably comes from earlier kernel state -- confirm where it is loaded.
# Count the (user, item) entries whose true-label sentence id list is empty.
# Each entry is indexed as [candidate ids, true ids], so position 1 is the label list.
empty_label = sum(
    1
    for item_map in useritem_to_sentids.values()
    for sent_id_pair in item_map.values()
    if len(sent_id_pair[1]) == 0
)
print(empty_label)

0


In [26]:
# Rewrite every user's entry so that the user id, the item ids and all sentence ids
# are strings. Entries whose true-label list is empty are dropped.
# Iterating over a snapshot (list(...)) keeps the loop safe while the dict is updated.
for raw_user_id, item_map in list(useritem_to_sentids.items()):
    useritem_to_sentids[str(raw_user_id)] = {
        str(raw_item_id): [list(map(str, sent_id_pair[0])), list(map(str, sent_id_pair[1]))]
        for raw_item_id, sent_id_pair in item_map.items()
        if len(sent_id_pair[1]) > 0
    }


In [27]:
# Verify the conversion: the first candidate id and the first true id of every
# (user, item) entry must be a str. Only the first element of each list is checked.
for item_map in useritem_to_sentids.values():
    for sent_id_pair in item_map.values():
        candidate_ids, true_ids = sent_id_pair[0], sent_id_pair[1]
        assert isinstance(candidate_ids[0], str)
        assert isinstance(true_ids[0], str)

In [50]:
# # write file
# with open('../Dataset/ratebeer/train/useritem2sentids.json', 'w') as f:
#     json.dump(useritem_to_sentids, f)

In [28]:
# Number of top-level keys (users) in the train user -> item -> sentence-id mapping.
len(useritem_to_sentids)

1664

In [29]:
# Collect every item id (as str) that appears under any user in the train mapping.
itemset = {
    str(raw_item_id)
    for item_map in useritem_to_sentids.values()
    for raw_item_id in item_map
}
print(len(itemset))

1490


# UserItem2SentIDs on Testing set

In [30]:
# useritem2sentids (test split). Later cells index each entry of this nested
# mapping as {user_id: {item_id: [candidate_sent_ids, true_sent_ids]}}.
# The path is built from `dataset_name`, like the other loaders, instead of a
# hard-coded 'medium_30', so changing the dataset in one place is enough.
test_useritem_path = '../Dataset/ratebeer/{}/test/useritem2sentids_test.json'.format(dataset_name)
with open(test_useritem_path, 'r') as f:
    testset_useritem_to_sentids = json.load(f)

In [31]:
# Number of top-level keys (users) in the test mapping.
len(testset_useritem_to_sentids)

1664

In [32]:
# Count the test-set (user, item) entries whose true-label sentence id list is empty.
# Each entry is indexed as [candidate ids, true ids], so position 1 is the label list.
empty_label = sum(
    1
    for item_map in testset_useritem_to_sentids.values()
    for sent_id_pair in item_map.values()
    if len(sent_id_pair[1]) == 0
)
print(empty_label)

0


In [33]:
# Collect every item id (as str) that appears under any user in the test mapping.
test_itemset = {
    str(raw_item_id)
    for item_map in testset_useritem_to_sentids.values()
    for raw_item_id in item_map
}
print(len(test_itemset))

1483


In [34]:
# Every item id seen in the test mapping must also occur in the train item set.
assert test_itemset.issubset(itemset)

In [35]:
# Rewrite every test-set user's entry so that the user id, the item ids and all
# sentence ids are strings. Entries whose true-label list is empty are dropped.
# Iterating over a snapshot (list(...)) keeps the loop safe while the dict is updated.
for raw_user_id, item_map in list(testset_useritem_to_sentids.items()):
    testset_useritem_to_sentids[str(raw_user_id)] = {
        str(raw_item_id): [list(map(str, sent_id_pair[0])), list(map(str, sent_id_pair[1]))]
        for raw_item_id, sent_id_pair in item_map.items()
        if len(sent_id_pair[1]) > 0
    }

In [36]:
# Verify the conversion on the test set: the first candidate id and the first true
# id of every (user, item) entry must be a str. Only the first element is checked.
for item_map in testset_useritem_to_sentids.values():
    for sent_id_pair in item_map.values():
        candidate_ids, true_ids = sent_id_pair[0], sent_id_pair[1]
        assert isinstance(candidate_ids[0], str)
        assert isinstance(true_ids[0], str)

In [61]:
# # write file
# with open('../Dataset/ratebeer/test/useritem2sentids_test.json', 'w') as f:
#     json.dump(testset_useritem_to_sentids, f)

## Check Whether the true sentence ids are in the test id2sentence mapping

In [37]:
# Number of top-level keys (users) in the test mapping after the str conversion.
len(testset_useritem_to_sentids)

1664

In [38]:
# Reload the test mapping from disk into a separate variable so it can be
# compared with the in-memory (converted) copy.
# The path is built from `dataset_name`, like the other loaders, instead of a
# hard-coded 'medium_30'.
test_useritem_path = '../Dataset/ratebeer/{}/test/useritem2sentids_test.json'.format(dataset_name)
with open(test_useritem_path, 'r') as f:
    testset_useritem_to_sentids_load = json.load(f)

In [39]:
# Compare the in-memory, str-converted mapping with the copy just reloaded from disk.
# True means the conversion changed nothing relative to the file's contents.
testset_useritem_to_sentids == testset_useritem_to_sentids_load

True

In [40]:
# Walk every (user, item) entry of the test mapping, check that ids are strings,
# and report any true sentence id that is missing from the test id -> sentence mapping.
true_set_sent_ids = set()   # distinct true sentence ids
number_of_sentence = 0      # total true sentence ids, duplicates included
for user_id_str, item_map in testset_useritem_to_sentids.items():
    assert isinstance(user_id_str, str)
    for item_id_str, sent_id_pair in item_map.items():
        assert isinstance(item_id_str, str)
        candidate_sent_ids, true_sent_ids = sent_id_pair[0], sent_id_pair[1]
        assert isinstance(candidate_sent_ids[0], str)
        assert isinstance(true_sent_ids[0], str)
        for cur_true_sent_id in true_sent_ids:
            true_set_sent_ids.add(cur_true_sent_id)
            number_of_sentence += 1
            if cur_true_sent_id not in testset_id_to_sent:
                # Unknown id: show it together with the whole (item id, entry) pair.
                print(cur_true_sent_id)
                print((item_id_str, sent_id_pair))

In [41]:
# Number of distinct true sentence ids collected from the test mapping.
len(true_set_sent_ids)

64334

In [42]:
# Total number of true sentence ids visited, counting repeats.
number_of_sentence

68066

# Feature Embedding

In [28]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only use it on a trusted file.
# NOTE(review): unlike the loaders that use `dataset_name`, this path has no dataset sub-directory -- confirm it is intended.
feature_emb_path = '../Dataset/ratebeer/train/feature/featureidembedding.pickle'
with open(feature_emb_path, 'rb') as emb_file:
    feature_emb = pickle.load(emb_file)

In [29]:
# Inspect the container type of the unpickled embedding object.
type(feature_emb)

dict

In [44]:
# Build a JSON-serialisable copy of the embeddings: each value is converted
# with .tolist(), the keys are kept as they are.
feature_embed_dict = {
    feature_key: embedding.tolist()
    for feature_key, embedding in feature_emb.items()
}

In [45]:
# Persist the list-valued embedding dict as JSON.
feature_embedding_json_path = '../Dataset/ratebeer/train/feature/featureid2embedding.json'
with open(feature_embedding_json_path, 'w') as embedding_json_file:
    json.dump(feature_embed_dict, embedding_json_file)

# Split Train/Test data (useritem2sentids) into multiple lines

## Train Set

In [2]:
import json
# useritem2sentenceid (train split)
# NOTE(review): this path has no dataset sub-directory, unlike the loaders that use `dataset_name` -- confirm it is intended.
train_useritem_path = '../Dataset/ratebeer/train/useritem2sentids.json'
with open(train_useritem_path, 'r') as train_useritem_file:
    trainset_useritem_to_sentids = json.load(train_useritem_file)


In [3]:
# Number of top-level keys (users) in the train mapping.
len(trainset_useritem_to_sentids)

2963

In [4]:
# Peek at the key (user id) of the first entry of the train mapping.
list(trainset_useritem_to_sentids.items())[0][0]

'1000'

In [5]:
# Inspect the container type of the value stored for the first user.
type(list(trainset_useritem_to_sentids.items())[0][1])

dict

In [30]:
# Checking How Many User/Item/Review in the training set
cnt_user = 0
cnt_review = 0
cnt_item_set = set()
for raw_user_id, item_map in trainset_useritem_to_sentids.items():
    user_id_str = str(raw_user_id)
    user_id = int(raw_user_id)      # fails loudly if a user id is not numeric
    for raw_item_id in item_map:
        item_id_str = str(raw_item_id)
        item_id = int(raw_item_id)  # fails loudly if an item id is not numeric
        # every user / item in the mapping must also occur in the training reviews
        assert user_id_str in train_user_id_set
        assert item_id_str in train_item_id_set
        cnt_item_set.add(item_id_str)
        cnt_review += 1
    cnt_user += 1

print("Total number of reviews: {}".format(cnt_review))
print("Total number of user: {}".format(cnt_user))
print("Total number of item: {}".format(len(cnt_item_set)))

Total number of reviews: 302344
Total number of user: 2963
Total number of item: 3744


In [34]:
# Write useritem2sentids into a line-by-line format: one JSON object per
# (user, item) entry, with keys 'user_id', 'item_id', 'candidate' and 'review'.
# The file is opened with 'w' rather than 'a': in append mode every re-run of
# this cell added another full copy of all records to the same file.
with open('../Dataset/ratebeer/train/useritem2sentids_multilines.json', 'w') as f1:
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    useritem_set = set()
    for raw_user_id, item_map in trainset_useritem_to_sentids.items():
        user_id_str = str(raw_user_id)
        user_id = int(raw_user_id)
        for raw_item_id, sent_id_pair in item_map.items():
            item_id_str = str(raw_item_id)
            item_id = int(raw_item_id)
            # each entry is indexed as [candidate sentence ids, gold review sentence ids]
            candidate_sent_ids = sent_id_pair[0]
            gold_revw_sent_ids = sent_id_pair[1]
            cur_data_dict = {'user_id': user_id, 'item_id': item_id, 'candidate': candidate_sent_ids, "review": gold_revw_sent_ids}
            # user / item ids are written as ints; sentence ids are written as stored
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

print("Total {} users".format(cnt_user))
print("Totat {} reviews".format(cnt_review))

Total 2963 users
Totat 302344 reviews


In [35]:
# Number of distinct (user, item) pairs written; equals the review count when no pair repeats.
len(useritem_set)

302344

## Test Set

In [43]:
import json
# useritem2sentenceid (test split)
# The path is built from `dataset_name`, like the other loaders, instead of a
# hard-coded 'medium_30'.
test_useritem_path = '../Dataset/ratebeer/{}/test/useritem2sentids_test.json'.format(dataset_name)
with open(test_useritem_path, 'r') as f:
    testset_useritem_to_sentids = json.load(f)

In [44]:
# Number of top-level keys (users) in the reloaded test mapping.
len(testset_useritem_to_sentids)

1664

In [45]:
# Checking How Many User/Item/Review are in the test set
cnt_user = 0
cnt_review = 0
cnt_item_set = set()
for raw_user_id, item_map in testset_useritem_to_sentids.items():
    user_id_str = str(raw_user_id)
    user_id = int(raw_user_id)      # fails loudly if a user id is not numeric
    for raw_item_id in item_map:
        item_id_str = str(raw_item_id)
        item_id = int(raw_item_id)  # fails loudly if an item id is not numeric
        # every test user / item must also occur in the training reviews
        assert user_id_str in train_user_id_set
        assert item_id_str in train_item_id_set
        cnt_item_set.add(item_id_str)
        cnt_review += 1
    cnt_user += 1

print("Total number of reviews: {}".format(cnt_review))
print("Total number of user: {}".format(cnt_user))
print("Total number of item: {}".format(len(cnt_item_set)))

Total number of reviews: 14675
Total number of user: 1664
Total number of item: 1483


In [46]:
# Write useritem2sentids_test into a line-by-line format: one JSON object per
# (user, item) entry, with keys 'user_id', 'item_id', 'candidate' and 'review'.
# The file is opened with 'w' rather than 'a': in append mode every re-run of
# this cell added another full copy of all records to the same file.
# The path is built from `dataset_name` instead of a hard-coded 'medium_30'.
test_multilines_path = '../Dataset/ratebeer/{}/test/useritem2sentids_test_multilines.json'.format(dataset_name)
with open(test_multilines_path, 'w') as f1:
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    useritem_set = set()
    for raw_user_id, item_map in testset_useritem_to_sentids.items():
        user_id_str = str(raw_user_id)
        user_id = int(raw_user_id)
        for raw_item_id, sent_id_pair in item_map.items():
            item_id_str = str(raw_item_id)
            item_id = int(raw_item_id)
            # each entry is indexed as [candidate sentence ids, true review sentence ids]
            candidate_sent_ids = sent_id_pair[0]
            true_revw_sent_ids = sent_id_pair[1]
            cur_data_dict = {'user_id':user_id, 'item_id':item_id, 'candidate':candidate_sent_ids, "review":true_revw_sent_ids}
            # user / item ids are written as ints; sentence ids are written as stored
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

print("Total {} users".format(cnt_user))
print("Totat {} reviews".format(cnt_review))

Total 1664 users
Totat 14675 reviews


In [47]:
# Number of distinct (user, item) pairs written; equals the review count when no pair repeats.
len(useritem_set)

14675

# Some Examples

In [118]:
# NOTE(review): `id_to_sent` is not assigned in any cell visible in this notebook
# (the loaders above create trainset_/testset_/validset_id_to_sent) -- confirm which mapping is meant.
# Example lookup of a sentence by its string id.
id_to_sent['335048']

'would not ever want to drink again .'

In [119]:
# Example lookup of a sentence by its string id.
id_to_sent['334999']

'nice spicy taste with some yeast in the finish great beer'

In [120]:
# Example lookup of a sentence by its string id.
id_to_sent['335017']

'good aroma , had a few sips of anothers beer and liked it .'

In [121]:
# Example lookup of a sentence by its string id.
id_to_sent['335005']

'i was hoping for more flavor than what i got .'

# User2Sentid and Item2Sentid

In [4]:
# user2sentid
user2sentids_path = '../Dataset/ratebeer/train/user/user2sentids.json'
with open(user2sentids_path, 'r') as user2sentids_file:
    trainset_user_to_sentids = json.load(user2sentids_file)

In [5]:
# item2sentid
item2sentids_path = '../Dataset/ratebeer/train/item/item2sentids.json'
with open(item2sentids_path, 'r') as item2sentids_file:
    trainset_item_to_sentids = json.load(item2sentids_file)

In [6]:
# Number of users in the user -> sentence-id mapping (a dict's length is its key count).
len(trainset_user_to_sentids)

2963

In [7]:
# Number of items in the item -> sentence-id mapping (a dict's length is its key count).
len(trainset_item_to_sentids)

3744

In [10]:
import numpy as np
# One entry per user: how many sentence ids are stored for that user.
user_num_sentences_cnt = [len(sent_ids) for sent_ids in trainset_user_to_sentids.values()]
print("Mean number of sentences per user: {}".format(np.mean(user_num_sentences_cnt)))
print("Max number of sentences per user: {}".format(np.max(user_num_sentences_cnt)))
print("Min number of sentences per user: {}".format(np.min(user_num_sentences_cnt)))

Mean number of sentences per user: 437.58825514681064
Max number of sentences per user: 4398
Min number of sentences per user: 8


In [11]:
import numpy as np
# One entry per item: how many sentence ids are stored for that item.
item_num_sentences_cnt = [len(sent_ids) for sent_ids in trainset_item_to_sentids.values()]
print("Mean number of sentences per item: {}".format(np.mean(item_num_sentences_cnt)))
print("Max number of sentences per item: {}".format(np.max(item_num_sentences_cnt)))
print("Min number of sentences per item: {}".format(np.min(item_num_sentences_cnt)))

Mean number of sentences per item: 355.7083333333333
Max number of sentences per item: 2774
Min number of sentences per item: 49


# UserItemCandidateSentence

In [12]:
# useritem2sentenceid (train split)
train_useritem_path = '../Dataset/ratebeer/train/useritem2sentids.json'
with open(train_useritem_path, 'r') as train_useritem_file:
    trainset_useritem_to_sentids = json.load(train_useritem_file)

In [13]:
# The top-level keys of the mapping are users, so its length is the user count.
num_train_users = len(trainset_useritem_to_sentids)
print("Total number of users: {}".format(num_train_users))

Total number of users: 2963


In [15]:
trainset_user_num_sentences_cnt = []        # number of sentences relevant to each user on trainset
trainset_item_num_sentences_cnt = []        # number of sentences relevant to each item on trainset
trainset_review_num_sentences_cnt = []      # number of sentences in true review set in each line of trainset
trainset_candidate_num_sentences_cnt = []   # number of sentences in candidate set in each line of trainset

# Union of candidate sentence ids seen for each user / each item.
trainset_user_sentids_set = dict()
trainset_item_sentids_set = dict()

cnt_reviews = 0

for user_id, user_chunks in trainset_useritem_to_sentids.items():
    for item_id, user_item_chunk in user_chunks.items():
        # each entry is indexed as [candidate sentence ids, gold review sentence ids]
        candidate_sents, gold_revw_sents = user_item_chunk[0], user_item_chunk[1]
        # per-review sizes of the gold review and of the candidate set
        trainset_review_num_sentences_cnt.append(len(gold_revw_sents))
        trainset_candidate_num_sentences_cnt.append(len(candidate_sents))
        # merge the candidates into the running set of this user and of this item
        trainset_user_sentids_set.setdefault(user_id, set()).update(candidate_sents)
        trainset_item_sentids_set.setdefault(item_id, set()).update(candidate_sents)
        # count this review
        cnt_reviews += 1
print("Totally {} reviews.".format(cnt_reviews))

Totally 302344 reviews.


In [18]:
# Append the size of each user's / item's candidate-sentence set to the count lists.
trainset_user_num_sentences_cnt.extend(len(sent_ids) for sent_ids in trainset_user_sentids_set.values())
trainset_item_num_sentences_cnt.extend(len(sent_ids) for sent_ids in trainset_item_sentids_set.values())

# some statistics
user_counts = trainset_user_num_sentences_cnt
item_counts = trainset_item_num_sentences_cnt
print("Mean number of sentences relevant to each user in trainset: {}".format(np.mean(user_counts)))
print("Max number of sentences relevant to each user in trainset: {}".format(np.max(user_counts)))
print("Min number of sentences relevant to each user in trainset: {}".format(np.min(user_counts)))
print("\n")
print("Mean number of sentences relevant to each item in trainset: {}".format(np.mean(item_counts)))
print("Max number of sentences relevant to each item in trainset: {}".format(np.max(item_counts)))
print("Min number of sentences relevant to each item in trainset: {}".format(np.min(item_counts)))

Mean number of sentences relevant to each user in trainset: 23027.967262909213
Max number of sentences relevant to each user in trainset: 98865
Min number of sentences relevant to each user in trainset: 988


Mean number of sentences relevant to each item in trainset: 20221.916666666668
Max number of sentences relevant to each item in trainset: 60410
Min number of sentences relevant to each item in trainset: 5865


In [19]:
# Per-review statistics: size of the candidate set and size of the true review.
candidate_counts = trainset_candidate_num_sentences_cnt
review_counts = trainset_review_num_sentences_cnt
print("Mean number of sentences per candidate-set in trainset: {}".format(np.mean(candidate_counts)))
print("Max number of sentences per candidate-set in trainset: {}".format(np.max(candidate_counts)))
print("Min number of sentences per candidate-set in trainset: {}".format(np.min(candidate_counts)))
print("\n")
print("Mean number of sentences per true review in trainset: {}".format(np.mean(review_counts)))
print("Max number of sentences per true review in trainset: {}".format(np.max(review_counts)))
print("Min number of sentences per true review in trainset: {}".format(np.min(review_counts)))

Mean number of sentences per candidate-set in trainset: 475.1722144312439
Max number of sentences per candidate-set in trainset: 546
Min number of sentences per candidate-set in trainset: 8


Mean number of sentences per true review in trainset: 4.419161617230704
Max number of sentences per true review in trainset: 49
Min number of sentences per true review in trainset: 1
