In [1]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
%matplotlib inline

## Creating a dense dataset using Word2vec

### Loading and preprocessing the data

In [2]:
address = '.\IMDB_Dataset.csv'
imdb = pd.read_csv(address)

In [3]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
def text_to_word_list(text):
    text = str(text)
    text = text.lower()
    text = re.sub('<[^<]+?>', '', text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
#    text = text.split()
#     text = str(text)
    return text

In [5]:
# apply the text prep to each row of the data frame
imdb.review = imdb.review.apply(lambda x: text_to_word_list(x))

In [6]:
imdb['sentiment'] = imdb['sentiment'].map({'positive':1,'negative':0})

In [7]:
imdb.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there a family where a little boy ja...,0
4,petter mattei love in the time of money is a v...,1


## Creating a function to tokenize the text 



In [8]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt');
nltk.download('stopwords');
nltk.download('wordnet');

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# prep word tokenize the text (lemmatize)
def prep_word (text):
    sw = set(stopwords.words("english"))
    word_tk = word_tokenize(text)
    words_nonstop = [w for w in word_tk if not w in sw]
    port_stem = PorterStemmer()
    lem = WordNetLemmatizer()
    lemm_words = []
    stemmed_word = []
    for w in words_nonstop:
        stemmed_word.append(port_stem.stem(w))
    return stemmed_word

In [10]:
# apply tokenization to each row of the data frame
imdb.review = imdb.review.apply(lambda x: prep_word(x))

In [11]:
imdb.review.head()

0    [one, review, mention, watch, 1, oz, episod, h...
1    [wonder, littl, product, film, techniqu, unass...
2    [thought, wonder, way, spend, time, hot, summe...
3    [basic, famili, littl, boy, jake, think, zombi...
4    [petter, mattei, love, time, money, visual, st...
Name: review, dtype: object

### Word2Vec implementation

In [12]:
from gensim.models.word2vec import Word2Vec
model = Word2Vec(imdb.review, vector_size=300, 
                 window=10, min_count=1,
                 sample=1e-3, workers=2);




In [13]:
#train the model for 30 epoches on entire dataset
#model.train(imdb.review, total_examples=model.corpus_count, epochs=30, report_delay=1)

In [14]:
# save the model
model.save("word2vec.model")

In [15]:
# load the model
model =  Word2Vec.load(".\word2vec.model").wv

In [16]:
'''in order to feed the data into the Kmeans, created 
an function to average each row over the each row as now every single words 
is represented by vector of size vector_size'''

import numpy as np
def ave_w2v(model, sentences):
    ave_f= np.zeros((len(sentences), model.vector_size))
    for i, sent in enumerate(sentences):
        for word in sent:
            try:
                vector = model[word]
            except KeyError:
                continue
        ave_f[i,:] = ave_f[i,:] + vector
        ave_f[i,:] = ave_f[i,:] / len(sent)
    return ave_f

In [17]:
# averaging the words vector across each row
ave_w2v = ave_w2v(model, imdb.review)

# KMeans clustering with max_iter 1000
from sklearn.cluster import KMeans
KMeans = KMeans(n_clusters=2, max_iter=1000, algorithm = 'auto')

fitted = KMeans.fit(ave_w2v)
prediction = KMeans.predict(ave_w2v)

In [18]:
# evaluate the model
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
imdb['sentiment_pred'] = pd.Series(prediction)
f1 = f1_score(imdb['sentiment'],imdb['sentiment_pred'],pos_label=1)
acc = accuracy_score(imdb['sentiment'],imdb['sentiment_pred'])

print("Accuracy {:.2f}".format(acc))

Accuracy 0.52
