In [21]:
import numpy as np
import gensim
import re
import os
import nltk
from sklearn.cluster import KMeans
from functools import reduce
from build_sentence_corp import extract_sent
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import random
import pickle
import string
import json

## Word2Vec

In [25]:
embedding_vector_size = 300
embedding_path = 'models/word2vec/GoogleNews-vectors-negative300.bin'
embedding = gensim.models.KeyedVectors.load_word2vec_format(embedding_path, binary=True)

In [27]:
def generate_vocab(sentence_dir):
    table = str.maketrans('','', string.punctuation)
    vocab_count = {}
    for sent in sentence_dir:
        for word in sent.split():
            word = word.lower().translate(table).strip()
            if word in vocab_count:
                vocab_count[word] += 1
            else:
                vocab_count[word] = 1
        
    vocab = set()
    k = 10    
    for key, val in vocab_count.items():
        if val >= k and key not in stopwords.words('english'):
            vocab.add(key)
    
    return vocab

In [28]:
def extract_sent(path, regex):
    review_dir = []
    for file in os.listdir(path):
        with open(path+file) as f:
            txt = f.read()
            reviews = re.findall(regex, txt)
        review_dir += reviews
    
    sentence_dir = []
    for review in review_dir:
        sentences = nltk.sent_tokenize(review)
        sentence_dir += sentences
        
    return sentence_dir

In [29]:
def initializeT(words):
    matrix = np.empty((0, embedding_vector_size))
    for w in words:
        try:
            matrix = np.vstack((matrix, embedding[w]))
        except:
            pass
    print(matrix.shape)
    kmeans = KMeans(n_clusters=14, random_state=0).fit(matrix)
    return kmeans


In [6]:
text_path = './data/TripAdvisor/Texts/'
regex = r"<Content>(.*)\n<Date>" 
sentence_dir = extract_sent(text_path, regex)

## Amazon  Reviews

In [7]:
path = 'data/AmazonReviews/laptops/'
reviews = []
for file in os.listdir(path):
    with open(path+file) as f:
        data = json.load(f)
        for review in data['Reviews']:
            reviews.append(review['Content'])

In [15]:
reviews[0]

'I\'ve had the S7-391 with 4Gb RAM and a 256 GB SSD for about 6 months, having paid a lot more for it when it first came out. I can recommend it, but only if you understand the limitations and problems with the machine. Here\'s a quick summary of my experience to date:GoodIt\'s very thin and very light. The screen has excellent resolution and reasonably responsive touch capabilities. Since it turns itself on when you open it, you need to actually shut it down to (carefully) clean the screen. it uses an SSD and never shuts itself off unless you force it to shut down, it comes up quite fast and is ready to go. It\'s even fast on a cold boot, like most SSD systems.That\'s mostly it for the good.Not GoodUntil recently, the WiFi would just stop connecting and I\'d need to manually reconnect to my router or an AP at work. Bluetooth would also get wiggy and stop communicating, particularly after a modest pause in activity. This was such a regular "feature" with the included BT mouse that I ha

## Tokenize

In [17]:
sentence_dir = []
for review in reviews:
    if review:
        sentences = nltk.sent_tokenize(review)
        sentence_dir += sentences

In [18]:
len(sentence_dir)

323984

In [30]:
words = generate_vocab(sentence_dir)

In [31]:
len(words)

11352

In [32]:
matrix = np.empty((0, embedding_vector_size))
for w in words:
    try:
        matrix = np.vstack((matrix, embedding[w]))
    except:
        pass
print(matrix.shape)

(9300, 300)


In [33]:
a = open('amazon_laptop_matrix.pickle', 'wb')
pickle.dump(matrix, a)
a.close()


In [34]:
kmeans = initializeT(words)

(9300, 300)


In [76]:
pickle_T = open('T_matrix.pickle', 'wb')
pickle.dump(kmeans.cluster_centers_, pickle_T)
pickle_T.close()

In [78]:
pickle_in = open('T_matrix.pickle', 'rb')
centers = pickle.load(pickle_in)