In [1]:
import nltk

In [29]:
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
import numpy as np

In [3]:
from nltk.stem import WordNetLemmatizer

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
from bs4 import BeautifulSoup

In [7]:
wordenet_lem = WordNetLemmatizer()

In [8]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [17]:
pos_rev = BeautifulSoup(open('/positive.review').read())

In [18]:
pos_rev = pos_rev.findAll('review_text')

In [19]:
neg_rev = BeautifulSoup(open('/negative.review').read())

In [20]:
neg_rev = neg_rev.findAll('review_text')

In [21]:
np.random.shuffle(pos_rev)

In [22]:
pos_rev = pos_rev[:len(neg_rev)]

In [31]:
word_index_map = {}
cur_index = 0

In [24]:
def my_tokenizer(s):
  s = s.lower()
  tokens = nltk.tokenize.word_tokenize(s)
  tokens = [t for t in tokens if len(t)>2]
  tokens = [wordenet_lem.lemmatize(t) for t in tokens]
  tokens = [t for t in tokens if t not in stopwords]
  return tokens

In [33]:
pos_tokenized = []
neg_tokenized = []

In [34]:
for review in pos_rev:
  tokens = my_tokenizer(review.text)
  pos_tokenized.append(tokens)
  for token in tokens:
    if token not in word_index_map:
      word_index_map[token] = cur_index
      cur_index+=1

In [35]:
for review in neg_rev:
  tokens = my_tokenizer(review.text)
  neg_tokenized.append(tokens)
  for token in tokens:
    if token not in word_index_map:
      word_index_map[token] = cur_index
      cur_index+=1

In [53]:
def tokens_to_vector(tokens, label):
  x = np.zeros(len(word_index_map)+1)
  if not tokens: # Handle cases where tokens list is empty
      x[-1] = label
      return x
  for t in tokens:
    i = word_index_map[t]
    x[i]+=1
  x = x/x.sum()
  x[-1] = label
  return x

In [37]:
N = len(pos_tokenized) + len(neg_tokenized)


In [38]:
N

2000

In [54]:
data = np.zeros((N, len(word_index_map)+1))
i = 0
for tokens in pos_tokenized:
  xy = tokens_to_vector(tokens,1)
  data[i:] = xy
  i+=1

for tokens in neg_tokenized:
  xy = tokens_to_vector(tokens,0)
  data[i:] = xy
  i+=1

np.random.shuffle(data)
X = data[:, :-1]
Y = data[:, -1]
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [57]:
model = LogisticRegression()

In [58]:
model.fit(Xtrain,Ytrain)

In [59]:
model.score(Xtest, Ytest)

0.77

In [64]:
threshold = 0.5
for word,index in word_index_map.items():
  weight = model.coef_[0][index]
  if weight>threshold or weight<-threshold:
    print(word,weight)

ha 0.770960130743546
you 0.8811200565612984
then -1.0840406554876332
wa -1.6329000322923797
've 0.8230836146911521
n't -1.9135877896585352
bit 0.6027695931690403
excellent 1.3469598948502153
laptop 0.5247544185041023
value 0.5604015034726559
speaker 0.889353829548618
little 0.9889521131731206
fast 1.0590939871668628
recommend 0.691964467414531
comfortable 0.6672339434850911
unit -0.7200395355889653
sound 0.9596092125434706
space 0.5980132047338175
lot 0.7680679809076206
piece -0.5323795928186885
look 0.5415359113392123
bad -0.7541954084400384
perfect 1.026724889496847
month -0.7561282393672896
video 0.5069755659422291
easy 1.7342810364303136
price 2.8013665933090115
cable 0.7045433923458266
happy 0.5727362562666469
buy -0.8115058666702609
doe -1.091228049022213
quality 1.4388950737415653
time -0.7330515714948431
using 0.5686913242879326
highly 1.0244802234930144
pretty 0.638741197798006
support -0.8657014147125403
waste -0.9752536523350027
company -0.5783913569280855
try -0.63012966638