In [79]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
%matplotlib inline

## Creating a dense dataset using Word2vec

### Loading and preprocessing the data

In [80]:
address = '.\IMDB_Dataset.csv'
imdb = pd.read_csv(address)

In [81]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [82]:
def text_to_word_list(text):
    text = str(text)
    text = text.lower()
    text = re.sub('<[^<]+?>', '', text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
#     text = text.split()
    text = str(text)
    return text

In [83]:
# apply the text prep to each row of the data frame
imdb.review = imdb.review.apply(lambda x: text_to_word_list(x))

In [99]:
# creating label for evaluation
imdb['sentiment'] = imdb['sentiment'].map({'positive':1,'negative':0})

In [9]:
imdb.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there a family where a little boy ja...,0
4,petter mattei love in the time of money is a v...,1


## Creating a function to tokenize the text 



In [10]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt');
nltk.download('stopwords');
nltk.download('wordnet');

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [89]:
# prep word tokenize the text (lemmatize)
# def prep_word (text):
#     sw = set(stopwords.words("english"))
#     word_tk = word_tokenize(text)
#     words_nonstop = [w for w in word_tk if not w in sw]
#     port_stem = PorterStemmer()
#     lem = WordNetLemmatizer()
#     lemm_words = []
#     stemmed_word = []
#     for w in words_nonstop:
#         stemmed_word.append(port_stem.stem(w))
#     return stemmed_word

# from gensim.models.phrases import Phrases, Phraser
# from gensim.models import Word2Vec
# from gensim.test.utils import get_tmpfile
# from gensim.models import KeyedVectors

# sent = [row for row in imdb.review]
# phrases = Phrases(sent, min_count=1, progress_per=50000)
# bigram = Phraser(phrases)
# sentences = bigram[sent]

In [87]:
# Defined seveal word tokenizing function as seen above, and compared their 
stop_words = set(['all', "she'll", "don't", 'being', 'over', 'through', 
'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should',
"he'd", 'to', 'only', "there's", 'those', 'under', 'ours', 'has', 
"haven't", 'do', 'them', 'his', "they'll", 'very', "who's", "they'd", 
'cannot', "you've", 'they', 'not', 'during', 'yourself', 'him', 'nor', 
"we'll", 'did', "they've", 'this', 'she', 'each', "won't", 'where', 
"mustn't", "isn't", "i'll", "why's", 'because', "you'd", 'doing', 'some', 
'up', 'are', 'further', 'ourselves', 'out', 'what', 'for', 'while', 
"wasn't", 'does', "shouldn't", 'above', 'between', 'be', 'we', 'who', 
"you're", 'were', 'here', 'hers', "aren't", 'by', 'both', 'about', 'would', 
'of', 'could', 'against', "i'd", "weren't", "i'm", 'or', "can't", 'own', 
'into', 'whom', 'down', "hadn't", "couldn't", 'your', "doesn't", 'from', 
"how's", 'her', 'their', "it's", 'there', 'been', 'why', 'few', 'too', 
'themselves', 'was', 'until', 'more', 'himself', "where's", "i've", 'with', 
"didn't", "what's", 'but', 'herself', 'than', "here's", 'he', 'me', 
"they're", 'myself', 'these', "hasn't", 'below', 'ought', 'theirs', 'my', 
"wouldn't", "we'd", 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 
'at', 'have', 'in', 'any', 'if', 'again', 'no', 'that', 'when', 'same', 
'how', 'other', 'which', 'you', "shan't", 'our', 'after', "let's", 'most', 
'such', 'on', "he'll", 'a', 'off', 'i', "she'd", 'yours', "you'll", 'so', 
"we're", "she's", 'the', "that's", 'having', 'once'])
def tokenize(texts):
    compiler = re.compile('[\W_]+', re.UNICODE)
    sentences = []
    for text in texts:
        sentence = text.lower().split(" ")
        sentence = [compiler.sub('', w) for w in sentence]
        sentences.append( [w for w in sentence if w not in stop_words] )
    return sentences

In [88]:
# apply tokenization to each row of the data frame
sentences = tokenize(imdb.review)

In [90]:
sentences[0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'will',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'show

### Word2Vec implementation

In [91]:
import multiprocessing
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [92]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     vector_size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)


w2v_model.build_vocab(sentences, progress_per=50000)

In [93]:
#train the model for 30 epoches on entire dataset
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(70505845, 188819400)

In [94]:
# save the model
w2v_model.save("word2vec.model")

In [95]:
# load the model
word_vectors = Word2Vec.load(".\word2vec.model").wv
model =  Word2Vec.load(".\word2vec.model").wv

In [96]:
'''in order to feed the data into the Kmeans, created 
a function to average each row over every single word, as it  
is represented by a vector of size vector_size'''

import numpy as np
def ave_w2v(model, sentences):
    ave_f= np.zeros((len(sentences), model.vector_size))
    for i, sent in enumerate(sentences):
        for word in sent:
            try:
                vector = model[word]
            except KeyError:
                continue
        ave_f[i,:] = ave_f[i,:] + vector
        ave_f[i,:] = ave_f[i,:] / len(sent)
    return ave_f

In [97]:
# averaging the words vector across each row
ave_w2v = ave_w2v(model, sentences)

# KMeans clustering with max_iter 1000
from sklearn.cluster import KMeans
KMeans = KMeans(n_clusters=2, max_iter=1000, algorithm = 'auto')

fitted = KMeans.fit(ave_w2v)
prediction = KMeans.predict(ave_w2v)

In [100]:
# evaluate the model
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
imdb['sentiment_pred'] = pd.Series(prediction)
f1 = f1_score(imdb['sentiment'],imdb['sentiment_pred'],pos_label=1)
acc = accuracy_score(imdb['sentiment'],imdb['sentiment_pred'])

print("Accuracy {:.2f}".format(acc))

Accuracy 0.51


## Improving the model performance by trying KMeans for each word

In [101]:
#assigning the cluster centers appropirate values
positive = 1
positive_center = KMeans.cluster_centers_[positive]
negative_center = KMeans.cluster_centers_[1-positive]

In [102]:
# examining the shpae of the vector out of word2vec
word_vectors.vectors.shape

(50939, 300)

In [103]:
# creating words datafame from word_vector that is created from word2vec
words = pd.DataFrame(word_vectors.vectors)
words.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.25663,-0.072012,0.077007,0.114384,-0.028004,-0.185907,-0.095212,0.229779,-0.31179,-0.468198,...,0.012735,0.532663,0.120021,0.128566,-0.12342,0.454978,-0.136039,-0.143117,0.020477,-0.41228
1,-0.393435,0.664386,0.059483,0.0663,-0.195026,-0.115573,-0.212024,0.826557,0.112265,-0.020417,...,0.291248,0.385556,0.644949,0.510316,0.302584,0.324675,-0.370578,0.277162,-0.087903,-0.399293
2,-0.321017,0.398134,-0.613452,0.271376,-0.426472,-0.106868,-0.118872,0.365422,0.331852,0.085577,...,-0.085237,0.028916,0.472266,-0.004378,-0.037832,0.341157,-0.043356,0.547153,0.121381,-0.305124
3,-0.03885,-0.062278,-0.09006,-0.303378,-0.479485,-0.075731,-0.313882,0.413343,0.344797,-0.279257,...,-0.1414,0.044314,0.507342,0.21036,0.454749,0.480581,-0.061156,0.357697,0.13531,-0.096451
4,-0.375018,0.58095,-0.019273,-0.160223,0.465614,-0.323928,-0.545703,0.6324,-0.254073,-0.04486,...,0.301077,0.014529,0.155383,0.098041,0.049482,-0.128783,0.061227,-0.272371,-0.200854,0.195803


In [104]:
# get the key for each word, and putting all vectors in one column for each word
words['words'] = word_vectors.index_to_key
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])

# using the KMeans predict each word cluster and assigning 1 or -1 for each cluster
words['cluster'] = words.vectors.apply(lambda x: KMeans.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])
words['cluster_number'] = [1 if i==positive  else -1 for i in words.cluster]

# kmeans.transform(X) returns is already the L2 norm distance to each cluster center,thus a measure of how 
# accurate or closeness the word to the cluster
words['l2_distance'] = words.apply(lambda x: 1/(KMeans.transform([x.vectors]).min()), axis=1)

# calculating score for each word based on their distance to the center, negative number will 
# be from cluster 0 and positive number from cluster 1
words['word_score'] = words.l2_distance * words.cluster_number

In [27]:
words.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,296,297,298,299,words,vectors,cluster,cluster_number,l2_distance,word_score
0,-0.384402,0.323972,0.018749,0.517974,-0.128163,-0.113358,0.091611,0.294123,0.084,-0.306493,...,-0.058121,-0.062115,0.125119,-0.604672,,"[-0.38440162, 0.3239717, 0.018748954, 0.517973...",1,1,0.243873,0.243873
1,-0.385377,0.796786,0.184321,0.21895,-0.170296,0.079719,-0.26783,0.676184,0.250768,0.02129,...,-0.213938,0.289156,-0.256425,-0.893538,movie,"[-0.38537654, 0.79678607, 0.18432134, 0.218949...",1,1,0.167779,0.167779
2,-0.214271,0.417396,-0.340524,0.057149,-0.29485,0.101731,-0.122862,0.53359,0.166588,-0.329179,...,0.181697,0.639468,-0.335981,-0.638485,film,"[-0.21427089, 0.41739598, -0.34052396, 0.05714...",1,1,0.193551,0.193551
3,-0.091152,-0.034703,-0.078999,-0.215078,0.003923,0.221269,-0.246913,0.225454,0.260562,-0.211778,...,-0.212288,0.098784,-0.195181,-0.585313,one,"[-0.091151565, -0.03470321, -0.07899916, -0.21...",1,1,0.224714,0.224714
4,-0.456848,0.414765,0.12727,-0.324015,0.304075,-0.02684,-0.021117,0.417057,-0.468053,-0.205694,...,-0.185872,-0.014593,-0.12653,0.136474,like,"[-0.4568476, 0.41476494, 0.12726973, -0.324014...",1,1,0.196834,0.196834


In [105]:
# retaining the needed columns and store them into a new data frame
words_trimmed = words[['words', 'vectors', 'cluster', 'cluster_number','l2_distance', 'word_score']]

In [106]:
# put the words columns as index
words_trimmed.set_index(words.words, inplace=True)

In [107]:
words_trimmed.head(15)

Unnamed: 0_level_0,words,vectors,cluster,cluster_number,l2_distance,word_score
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,,"[-0.25662994, -0.07201213, 0.07700688, 0.11438...",1,1,0.247434,0.247434
movie,movie,"[-0.39343548, 0.66438645, 0.05948274, 0.066300...",1,1,0.165659,0.165659
film,film,"[-0.3210167, 0.39813447, -0.6134517, 0.2713761...",1,1,0.197589,0.197589
one,one,"[-0.03885012, -0.062278494, -0.090060316, -0.3...",1,1,0.223871,0.223871
like,like,"[-0.37501764, 0.58095044, -0.01927281, -0.1602...",1,1,0.197565,0.197565
just,just,"[-0.3624778, 0.69185585, 0.11682977, -0.361674...",1,1,0.192452,0.192452
good,good,"[0.20961219, 0.54847515, 0.17450707, 0.2739509...",1,1,0.175222,0.175222
time,time,"[-0.26528504, 0.103988476, -0.16395175, -0.473...",1,1,0.177211,0.177211
even,even,"[-0.46085772, 0.59741426, -0.38659415, -0.2617...",1,1,0.209586,0.209586
will,will,"[-0.6121801, 0.40187812, 0.3320719, 0.2767503,...",1,1,0.15735,0.15735


### Calculating an average score for each row based on its words scores

In [31]:
new_pred = np.zeros(len(sentences))

In [None]:
# This function iterate through each sentence, and grabs their word score as defined above
# then average over the length of that sentence to calculate a value which is an average score value
# of all the words
for i, row in enumerate(sentences):
    if i%100 == 0:
        print('iteration ', i)
    row_list = []
    for sent in row:
        if words_trimmed['words'].str.contains(sent).any():
            try:
                row_list.append(words_trimmed.loc[sent]['word_score'])
            except KeyError:
                continue
    new_pred[i] = (np.mean(row_list))

In [109]:
# assigning the predictoin into a new column
imdb['new_pred']=pd.Series(new_pred)

In [118]:
imdb['new_pred'] = [1 if i > 0 else 0 for i in imdb.new_pred]

In [119]:
imdb_final = imdb.dropna(axis=0)

In [121]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
f1 = f1_score(imdb_final['sentiment'],imdb_final['new_pred'],pos_label=1)
acc = accuracy_score(imdb_final['sentiment'],imdb_final['new_pred'])

print("Accuracy {:.2f}".format(acc))

Accuracy 0.50
