In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


In [2]:
import spacy
import nltk
import re
import json
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Read Data

In [3]:
def load_reviews(file_path, log_every=10000):
    """Load a JSON-lines review file into a list of rows.

    Each non-blank line of ``file_path`` is parsed with ``json.loads`` and must
    provide the keys ``'user'``, ``'item'``, ``'rating'`` and ``'review'``.

    Args:
        file_path: path of the JSON-lines file to read.
        log_every: print a progress message every ``log_every`` loaded rows.

    Returns:
        A list of ``[item_id, user_id, rating, review]`` lists, in file order.

    Raises:
        KeyError: if a record lacks one of the four expected keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    reviews = []
    with open(file_path) as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a record; json.loads would raise on it.
            if not line.strip():
                continue
            record = json.loads(line)
            reviews.append(
                [record['item'], record['user'], record['rating'], record['review']])
            if len(reviews) % log_every == 0:
                print('{} lines loaded.'.format(len(reviews)))
    return reviews


dir_path = '../Dataset/ratebeer/large_500'
# Load train dataset
train_review = load_reviews(os.path.join(dir_path, 'train_review_filtered.json'))
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset
test_review = load_reviews(os.path.join(dir_path, 'test_review_filtered.json'))
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
50000 lines loaded.
60000 lines loaded.
70000 lines loaded.
80000 lines loaded.
90000 lines loaded.
100000 lines loaded.
110000 lines loaded.
120000 lines loaded.
130000 lines loaded.
140000 lines loaded.
150000 lines loaded.
160000 lines loaded.
170000 lines loaded.
180000 lines loaded.
190000 lines loaded.
200000 lines loaded.
210000 lines loaded.
220000 lines loaded.
230000 lines loaded.
240000 lines loaded.
250000 lines loaded.
260000 lines loaded.
270000 lines loaded.
280000 lines loaded.
290000 lines loaded.
300000 lines loaded.
Finish loading train dataset, totally 302573 lines.
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
Finish loading test dataset, totally 40730 lines.


## Convert List Data to pandas DataFrames

In [4]:
# Wrap the loaded rows in DataFrames; the column order matches the
# [item, user, rating, review] layout of each row.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data = pd.DataFrame(train_review, columns=review_columns)
df_test_data = pd.DataFrame(test_review, columns=review_columns)

In [5]:
df_train_data

Unnamed: 0,item,user,rating,review
0,196,999,17,dark brown body with a light brown head . nutt...
1,1149,999,14,hazy orange / gold body is topped by a medium ...
2,236,999,16,12 oz bottle thanks to acknud . pours much dar...
3,634,999,15,clear and radiant mahogany body with a small b...
4,490,999,16,nice looker with tons of spiderweb lacing . bo...
...,...,...,...,...
302568,373,3849,9,"this shit is rejected coors light , i m certai..."
302569,1458,3849,10,"being a philly fan , i m supposed to hate anyt..."
302570,269,3849,10,"a good malt liquor , undeniably , but it was l..."
302571,859,3849,2,how this beer is actually better than regular ...


In [6]:
# Report how many distinct users and items occur in the training split.
train_user_ids = df_train_data['user'].unique()
train_item_ids = df_train_data['item'].unique()
print("number of user in trainset: {}".format(len(train_user_ids)))
print("number of item in trainset: {}".format(len(train_item_ids)))

number of user in trainset: 2963
number of item in trainset: 3744


In [7]:
def catDoc(textlist):
    """Flatten one level of nesting.

    Args:
        textlist: iterable of iterables (e.g. a list of token lists).

    Returns:
        A new list containing the elements of every inner iterable,
        concatenated in order.
    """
    return [element for inner in textlist for element in inner]

In [8]:
def calTFidf(text):
    """Fit a bag-of-words model on ``text`` and re-weight the counts with TF-IDF.

    Tokens are lower-cased and scikit-learn's built-in English stop-word list
    is excluded while counting. The shape of the raw count matrix is printed
    as a sanity check.

    Args:
        text: iterable of raw document strings.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted ``CountVectorizer``
        and the matrix returned by ``TfidfTransformer.fit_transform`` on the
        counts.
    """
    # Stop words are omitted so they cannot dominate the vocabulary.
    count_model = CountVectorizer(stop_words='english', lowercase=True)
    term_counts = count_model.fit_transform(text)
    print("wordcount: ", term_counts.shape)

    tfidf_matrix = TfidfTransformer().fit_transform(term_counts)
    return count_model, tfidf_matrix

# Load Reviews

In [9]:
# Collect the raw review text of every training example, in DataFrame row order.
# Reading the column directly yields the same list as looping over iterrows(),
# without materialising a Series for each of the ~300k rows.
documents = df_train_data['review'].tolist()

In [10]:
len(documents)

302573

# Compute TF-IDF Matrix

In [11]:
# Fit the count vectorizer on all training reviews and compute their TF-IDF matrix
# (one row per review, one column per vocabulary word).
vectorizer, tfidf_matrix = calTFidf(documents)

wordcount:  (302573, 87900)


In [12]:
# Summarise the corpus size and the size of the fitted vocabulary.
num_examples = len(documents)
vocab_size = len(vectorizer.vocabulary_)
print("The number of example is {0}, and the TFIDF vocabulary size is {1}".format(
    num_examples, vocab_size))

The number of example is 302573, and the TFIDF vocabulary size is 87900


## Compute Mean TF-IDF Score for Each Word

In [13]:
# Average each word's TF-IDF weight over all documents (axis 0). The result
# keeps a leading axis of length 1, so later cells index it as word_tfidf[0].
mean_over_documents = tfidf_matrix.mean(axis=0)
word_tfidf = np.array(mean_over_documents)

In [14]:
word_tfidf.shape

(1, 87900)

In [25]:
word_tfidf[0][:20]

array([1.60342261e-04, 2.92155393e-05, 2.13455265e-06, 8.52083795e-07,
       8.94606634e-07, 1.54518204e-06, 2.40607564e-06, 3.45788165e-05,
       1.36548918e-06, 9.05476767e-07, 4.34640838e-05, 1.41078352e-06,
       4.22942232e-05, 6.80602743e-07, 1.48317784e-05, 3.72549987e-07,
       1.48041600e-06, 1.93395367e-06, 7.45418140e-07, 1.44921904e-06])

## Sort Words According to Their Mean TF-IDF Value

In [15]:
# Vocabulary indices ordered from highest to lowest mean TF-IDF:
# argsort is ascending, so sort the negated scores.
negated_scores = -word_tfidf[0]
word_order = np.argsort(negated_scores)

In [16]:
word_order.shape

(87900,)

In [17]:
word_order[:20]

array([37807,  6347,  9120, 31742, 76268, 46627, 48780, 53941, 77191,
       39499, 11956, 85685, 31214, 61061, 21996, 35609, 13064, 14720,
       18539, 10532])

In [18]:
import string
# ASCII punctuation characters; used when building the feature vocabulary
# to filter out punctuation tokens.
punct = string.punctuation

In [19]:
# Build the feature vocabulary: the `feature_word_num` words with the highest
# mean TF-IDF, skipping pure-number and pure-punctuation tokens.
# vocab2id maps word -> feature id and id2vocab maps feature id -> word;
# ids are stored as strings in both mappings.
vocab2id = {}
id2vocab = {}

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new accessor when it exists and
# fall back to the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    id2word = vectorizer.get_feature_names_out()
else:
    id2word = vectorizer.get_feature_names()
feature_word_num = 2000
# Not used in this cell; kept in case other cells reference it.
feature_word_idx = 0

cnt = 0
for idx in word_order:
    # get the word
    word = id2word[idx]
    # Skip tokens that are a number or consist only of punctuation characters.
    # (A plain `word in punct` would be a substring test against the
    # punctuation string and would miss tokens such as '__'.)
    if word.isdigit() or all(ch in punct for ch in word):
        print(word)
        continue
    # Use one id for both directions so the two mappings cannot drift apart.
    feature_id = str(cnt)
    vocab2id[word] = feature_id
    id2vocab[feature_id] = word
    cnt += 1
    if cnt == feature_word_num:
        break

12
22
2008
2011
500
2009
10
750
2010
2007
11
2006
33
2005
50
330
2004
08
2003
07
25
09
16
20
06
15
30
13
18
341
14
24
05
17
21
99
27
23
355
28
75
19
29
04
03
26
650
02
31
375
01
2002
100
40
90


## Save the Feature Vocabulary to JSON Files

In [20]:
len(vocab2id)

2000

In [21]:
len(id2vocab)

2000

In [22]:
# Persist the word -> feature-id mapping as JSON.
feature2id_path = '../Dataset/ratebeer/large_500/train/feature/feature2id.json'
with open(feature2id_path, 'w') as out_file:
    json.dump(vocab2id, out_file)

In [23]:
# Persist the feature-id -> word mapping as JSON.
id2feature_path = '../Dataset/ratebeer/large_500/train/feature/id2feature.json'
with open(id2feature_path, 'w') as out_file:
    json.dump(id2vocab, out_file)

In [24]:
# Write the feature words to a plain-text file, 10 tab-separated words per
# line (the last line may be shorter), in vocab2id insertion order.
#
# The file is opened once in 'w' mode, like the JSON files written alongside
# it: opening in append mode would duplicate the contents every time this
# cell is re-run.
features_path = '../Dataset/ratebeer/large_500/train/feature/features.txt'
words_per_line = 10
feature_words = list(vocab2id)

with open(features_path, 'w') as f:
    for start in range(0, len(feature_words), words_per_line):
        f.write('\t'.join(feature_words[start:start + words_per_line]))
        f.write('\n')