In [274]:
from bs4 import BeautifulSoup as bs
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score

In [142]:
positive_reviews = bs(open('datasets/sentiment/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

In [143]:
negative_reviews = bs(open('datasets/sentiment/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [144]:
np.random.shuffle(positive_reviews)
sizeNr = len(negative_reviews)
positive_reviews = positive_reviews[0:sizeNr]

In [145]:
wordnetlemmatizer = WordNetLemmatizer()
stopwords = [word.strip() for word in open('datasets/stopwords.txt')]
def CustomTokenizer(review):
    #make all reviews lower cased
    review = review.lower()
    #tokenize all words
    review = nltk.tokenize.word_tokenize(review)
    #removing all stop words
    review = [word for word in review if word not in stopwords]
    #removing all words with less than two characters
    review = [word for word in review if len(word)>2]
    #lemmatizing the word
    review = [wordnetlemmatizer.lemmatize(word) for word in review]
    return review

In [146]:
#dictionary indicating words to index
word2idx = {}
positive_tokenized = []
negative_tokenized = []
current = 0
for review in positive_reviews:
    review = CustomTokenizer(review.text)
    review.append(1)
    positive_tokenized.append(review)
    for token in review:
        if token not in word2idx:
            word2idx[token] = current
            current += 1
for review in negative_reviews:
    review = CustomTokenizer(review.text)
    review.append(0)
    negative_tokenized.append(review)
    for token in review:
        if token not in word2idx:
            word2idx[token] = current
            current += 1

In [162]:
#size of the dataset
D = len(negative_tokenized) + len(positive_tokenized)
#size of the vocabulary, 1 added for the label
V = len(word2idx)
#document-term matrix
dt = np.zeros((D,V))
y = np.zeros(D)
combined_reviews = negative_tokenized + positive_tokenized

In [233]:
#populating the document-term matrix
document_num = 0
for review in positive_tokenized:
    for token in review:
        dt[document_num][word2idx[token]] += 1
    dt[document_num] /= sum(dt[document_num])
    y[document_num] = 1
    document_num +=  1    
#populating the document-term matrix
for review in negative_tokenized:
    for token in review:
        dt[document_num][word2idx[token]] += 1
    dt[document_num] /= sum(dt[document_num])    
    y[document_num] = 0
    document_num +=  1    

In [271]:
X_train, X_test, y_train, y_test = train_test_split(dt,y,test_size=0.20)
lg = LogisticRegression()

In [272]:
lg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [273]:
lg.score(X_test,y_test)

0.9503105590062112

In [276]:
cross_val_score(lg,dt,y,cv=5)

array([ 0.9691358 ,  0.99382716,  0.99375   ,  0.98125   ,  0.9875    ])