In [347]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [348]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [349]:
positive_reviews = BeautifulSoup(open('electronics/positive.review').read(),"lxml")
positive_reviews = positive_reviews.findAll('review_text')

In [350]:
negative_reviews = BeautifulSoup(open('electronics/negative.review').read(),"lxml")
negative_reviews = negative_reviews.findAll('review_text')

In [351]:
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [352]:
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

In [353]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

In [354]:
for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
            

In [355]:
len(word_index_map)

11088

In [356]:
def tokens_to_vectors(tokens,label):
    x = np.zeros((1,len(word_index_map) + 1))
    for t in tokens:
        i = word_index_map[t]
        x[0][i] += 1
    x[0] = x[0] / x[0].sum()
    x[0][-1] = label
    return x


In [357]:
N = len(positive_tokenized) + len(negative_tokenized)

In [358]:
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vectors(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vectors(tokens, 0)
    data[i,:] = xy
    i += 1


In [359]:
np.random.shuffle(data)

In [360]:
data.shape

(2000, 11089)

In [361]:
X = data[:,:-1]
Y = data[:,-1]

In [362]:
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [363]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification rate:", model.score(Xtest, Ytest))

Classification rate: 0.71


In [367]:
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)


happy 0.667343707814
pretty 0.616615831601
sound 1.07111736227
quality 1.4888183132
price 2.72501873411
've 0.798556896459
ha 0.754260833704
wa -1.42369027984
cable 0.65166191306
picture 0.631014679163
speaker 0.956851007981
doe -1.25358680217
bad -0.796476344304
easy 1.77706319065
unit -0.684524030024
you 0.919938704686
n't -2.17366152363
love 1.18723960014
comfortable 0.645876783046
lot 0.669641965105
tried -0.845646866898
poor -0.705749158037
bit 0.609363433509
time -0.753929782217
then -1.09924572358
highly 0.971227919356
money -0.974279029837
warranty -0.628059697859
card -0.532785010249
home 0.515085892106
little 0.957063241742
using 0.666653815492
space 0.556558749421
buy -0.840089394352
memory 0.985886216481
video 0.519164707149
month -0.665471592923
expected 0.574178475824
item -0.932281639094
hour -0.558600477512
fast 0.857976584521
perfect 1.02728966802
try -0.696827534671
paper 0.652864209776
look 0.605312872576
laptop 0.53274052995
recommend 0.676112205749
company -0.51747