In [1]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
%matplotlib inline

## Creating a dense dataset using Word2vec

### Loading and preprocessing the data

In [2]:
address = '.\IMDB_Dataset.csv'
imdb = pd.read_csv(address)

In [3]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
def text_to_word_list(text):
    text = str(text)
    text = text.lower()
    text = re.sub('<[^<]+?>', '', text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
#     text = text.split()
    text = str(text)
    return text

In [5]:
# apply the text prep to each row of the data frame
imdb.review = imdb.review.apply(lambda x: text_to_word_list(x))

In [6]:
# creating label for evaluation
imdb['sentiment'] = imdb['sentiment'].map({'positive':1,'negative':0})

In [7]:
imdb.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there a family where a little boy ja...,0
4,petter mattei love in the time of money is a v...,1


## Creating a function to tokenize the text 



In [8]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt');
nltk.download('stopwords');
nltk.download('wordnet');

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [89]:
# prep word tokenize the text (lemmatize)
# def prep_word (text):
#     sw = set(stopwords.words("english"))
#     word_tk = word_tokenize(text)
#     words_nonstop = [w for w in word_tk if not w in sw]
#     port_stem = PorterStemmer()
#     lem = WordNetLemmatizer()
#     lemm_words = []
#     stemmed_word = []
#     for w in words_nonstop:
#         stemmed_word.append(port_stem.stem(w))
#     return stemmed_word

# from gensim.models.phrases import Phrases, Phraser
# from gensim.models import Word2Vec
# from gensim.test.utils import get_tmpfile
# from gensim.models import KeyedVectors

# sent = [row for row in imdb.review]
# phrases = Phrases(sent, min_count=1, progress_per=50000)
# bigram = Phraser(phrases)
# sentences = bigram[sent]

In [9]:
# Defined seveal word tokenizing function as seen above, and compared their 
stop_words = set(['all', "she'll", "don't", 'being', 'over', 'through', 
'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should',
"he'd", 'to', 'only', "there's", 'those', 'under', 'ours', 'has', 
"haven't", 'do', 'them', 'his', "they'll", 'very', "who's", "they'd", 
'cannot', "you've", 'they', 'not', 'during', 'yourself', 'him', 'nor', 
"we'll", 'did', "they've", 'this', 'she', 'each', "won't", 'where', 
"mustn't", "isn't", "i'll", "why's", 'because', "you'd", 'doing', 'some', 
'up', 'are', 'further', 'ourselves', 'out', 'what', 'for', 'while', 
"wasn't", 'does', "shouldn't", 'above', 'between', 'be', 'we', 'who', 
"you're", 'were', 'here', 'hers', "aren't", 'by', 'both', 'about', 'would', 
'of', 'could', 'against', "i'd", "weren't", "i'm", 'or', "can't", 'own', 
'into', 'whom', 'down', "hadn't", "couldn't", 'your', "doesn't", 'from', 
"how's", 'her', 'their', "it's", 'there', 'been', 'why', 'few', 'too', 
'themselves', 'was', 'until', 'more', 'himself', "where's", "i've", 'with', 
"didn't", "what's", 'but', 'herself', 'than', "here's", 'he', 'me', 
"they're", 'myself', 'these', "hasn't", 'below', 'ought', 'theirs', 'my', 
"wouldn't", "we'd", 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 
'at', 'have', 'in', 'any', 'if', 'again', 'no', 'that', 'when', 'same', 
'how', 'other', 'which', 'you', "shan't", 'our', 'after', "let's", 'most', 
'such', 'on', "he'll", 'a', 'off', 'i', "she'd", 'yours', "you'll", 'so', 
"we're", "she's", 'the', "that's", 'having', 'once'])
def tokenize(texts):
    compiler = re.compile('[\W_]+', re.UNICODE)
    sentences = []
    for text in texts:
        sentence = text.lower().split(" ")
        sentence = [compiler.sub('', w) for w in sentence]
        sentences.append( [w for w in sentence if w not in stop_words] )
    return sentences

In [10]:
# apply tokenization to each row of the data frame
sentences = tokenize(imdb.review)

In [11]:
sentences[0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'will',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'show

### Word2Vec implementation

In [12]:
import multiprocessing
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors



In [13]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     vector_size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)


w2v_model.build_vocab(sentences, progress_per=50000)

In [14]:
#train the model for 30 epoches on entire dataset
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(70501308, 188819400)

In [15]:
# save the model
w2v_model.save("word2vec.model")

In [16]:
# load the model
word_vectors = Word2Vec.load(".\word2vec.model").wv
model =  Word2Vec.load(".\word2vec.model").wv

In [96]:
'''in order to feed the data into the Kmeans, created 
a function to average each row over every single word, as it  
is represented by a vector of size vector_size'''

import numpy as np
def ave_w2v(model, sentences):
    ave_f= np.zeros((len(sentences), model.vector_size))
    for i, sent in enumerate(sentences):
        for word in sent:
            try:
                vector = model[word]
            except KeyError:
                continue
        ave_f[i,:] = ave_f[i,:] + vector
        ave_f[i,:] = ave_f[i,:] / len(sent)
    return ave_f

# Modifying the original code to run with K=3

In [17]:
# averaging the words vector across each row
# ave_w2v = ave_w2v(model, sentences)

# KMeans clustering with max_iter 1000
from sklearn.cluster import KMeans
KMeans = KMeans(n_clusters=3, max_iter=1000, algorithm = 'auto')

fitted = KMeans.fit(word_vectors.vectors.astype('double'))
prediction = KMeans.predict(word_vectors.vectors.astype('double'))

In [19]:
# evaluate the model
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
imdb['sentiment_pred'] = pd.Series(prediction)
# f1 = f1_score(imdb['sentiment'],imdb['sentiment_pred'],pos_label=1)
acc = accuracy_score(imdb['sentiment'],imdb['sentiment_pred'])

print("Accuracy {:.2f}".format(acc))

Accuracy 0.40


## Assigning each word the cluster its own cluster

## Improving the model performance by trying KMeans for each word

In [20]:
#assigning the cluster centers appropirate values
positive = 1
positive_center = KMeans.cluster_centers_[positive]
negative_center = KMeans.cluster_centers_[1-positive]

In [21]:
# examining the shpae of the vector out of word2vec
word_vectors.vectors.shape

(50939, 300)

In [22]:
# creating words datafame from word_vector that is created from word2vec
words = pd.DataFrame(word_vectors.vectors)
words.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.194555,0.283483,0.206052,0.195558,-0.098831,-0.118183,0.142301,0.297484,-0.44939,-0.034129,...,0.023032,0.499222,0.348928,0.152259,0.160356,0.169473,0.214394,-0.342808,0.023434,-0.412089
1,-0.14059,0.570906,-0.22287,0.329827,-0.224997,-0.12011,0.102595,0.843669,-0.29465,0.00693,...,-0.229324,0.348344,0.430058,0.268635,0.595714,0.250054,-0.220882,-0.096698,0.106332,-0.566308
2,-0.091227,0.102367,-0.448254,0.266532,-0.280452,-0.038558,0.044803,0.031807,-0.228689,0.25963,...,-0.291684,0.308395,0.261651,0.009782,0.293836,0.20679,0.122776,0.037662,0.161961,-0.473563
3,-0.087425,-0.025102,0.311978,0.431233,-0.191786,-0.133311,-0.105742,0.323807,-0.03046,-0.243507,...,-0.143623,-0.2208,0.370557,0.188152,0.469862,0.290857,0.114241,0.13985,-0.019146,-0.370761
4,-0.360749,0.758457,-0.191592,-0.257975,0.102662,-0.10371,-0.230652,0.303837,0.058728,-0.371631,...,0.066145,-0.186011,-0.132846,0.159813,0.099227,-0.268661,-0.111887,-0.025944,-0.4597,0.283484


In [23]:
# get the key for each word, and putting all vectors in one column for each word
words['words'] = word_vectors.index_to_key
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])

# using the KMeans predict each word cluster and assigning 1 or -1 for each cluster
words['cluster'] = words.vectors.apply(lambda x: KMeans.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])
words['cluster_number'] = [1 if i==positive  else -1 for i in words.cluster]

# kmeans.transform(X) returns is already the L2 norm distance to each cluster center,thus a measure of how 
# accurate or closeness the word to the cluster
words['l2_distance'] = words.apply(lambda x: 1/(KMeans.transform([x.vectors]).min()), axis=1)

# calculating score for each word based on their distance to the center, negative number will 
# be from cluster 0 and positive number from cluster 1
words['word_score'] = words.l2_distance * words.cluster_number

In [24]:
words.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,296,297,298,299,words,vectors,cluster,cluster_number,l2_distance,word_score
0,-0.194555,0.283483,0.206052,0.195558,-0.098831,-0.118183,0.142301,0.297484,-0.44939,-0.034129,...,0.214394,-0.342808,0.023434,-0.412089,,"[-0.19455506, 0.28348318, 0.20605221, 0.195558...",0,-1,0.241723,-0.241723
1,-0.14059,0.570906,-0.22287,0.329827,-0.224997,-0.12011,0.102595,0.843669,-0.29465,0.00693,...,-0.220882,-0.096698,0.106332,-0.566308,movie,"[-0.14058958, 0.5709055, -0.22287044, 0.329827...",1,1,0.172073,0.172073
2,-0.091227,0.102367,-0.448254,0.266532,-0.280452,-0.038558,0.044803,0.031807,-0.228689,0.25963,...,0.122776,0.037662,0.161961,-0.473563,film,"[-0.091226876, 0.10236713, -0.44825375, 0.2665...",1,1,0.24368,0.24368
3,-0.087425,-0.025102,0.311978,0.431233,-0.191786,-0.133311,-0.105742,0.323807,-0.03046,-0.243507,...,0.114241,0.13985,-0.019146,-0.370761,one,"[-0.08742523, -0.025102125, 0.31197816, 0.4312...",1,1,0.215025,0.215025
4,-0.360749,0.758457,-0.191592,-0.257975,0.102662,-0.10371,-0.230652,0.303837,0.058728,-0.371631,...,-0.111887,-0.025944,-0.4597,0.283484,like,"[-0.36074874, 0.7584568, -0.19159229, -0.25797...",0,-1,0.197997,-0.197997


In [25]:
# retaining the needed columns and store them into a new data frame
words_trimmed = words[['words', 'vectors', 'cluster', 'cluster_number','l2_distance', 'word_score']]

In [26]:
# put the words columns as index
words_trimmed.set_index(words.words, inplace=True)

In [27]:
words_trimmed.head(15)

Unnamed: 0_level_0,words,vectors,cluster,cluster_number,l2_distance,word_score
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,,"[-0.19455506, 0.28348318, 0.20605221, 0.195558...",0,-1,0.241723,-0.241723
movie,movie,"[-0.14058958, 0.5709055, -0.22287044, 0.329827...",1,1,0.172073,0.172073
film,film,"[-0.091226876, 0.10236713, -0.44825375, 0.2665...",1,1,0.24368,0.24368
one,one,"[-0.08742523, -0.025102125, 0.31197816, 0.4312...",1,1,0.215025,0.215025
like,like,"[-0.36074874, 0.7584568, -0.19159229, -0.25797...",0,-1,0.197997,-0.197997
just,just,"[-0.24510749, 0.38719353, -0.011783284, 0.0007...",1,1,0.190104,0.190104
good,good,"[0.19814792, 0.8872247, 0.17918931, 0.14578643...",1,1,0.17422,0.17422
time,time,"[-0.18041916, 0.3175915, 0.27195337, 0.1915056...",1,1,0.177395,0.177395
even,even,"[-0.04309079, 0.20007537, -0.34513903, -0.3730...",1,1,0.194733,0.194733
will,will,"[-0.20239432, 0.4686843, 0.40891212, 0.2796957...",0,-1,0.155112,-0.155112


## Having three clusters, inspecting each cluster for the words

In [29]:
words_trimmed['cluster'].value_counts()

0    31109
1     9923
2     9907
Name: cluster, dtype: int64

In [35]:
words_trimmed['words'][words_trimmed['cluster']==1].head(50)

words
movie              movie
film                film
one                  one
just                just
good                good
time                time
even                even
story              story
really            really
see                  see
can                  can
well                well
much                much
people            people
great              great
also                also
made                made
make                make
way                  way
movies            movies
characters    characters
think              think
watch              watch
films              films
many                many
seen                seen
love                love
never              never
plot                plot
life                life
acting            acting
show                show
best                best
little            little
ever                ever
better            better
end                  end
still              still
say                  say
scenes            s

## Customizng stop words based on the new clustering

In [33]:
stop_words_k0 = words_trimmed['words'][words_trimmed['cluster']==0]
stop_words_k1 = words_trimmed['words'][words_trimmed['cluster']==1]
stop_words_k2 = words_trimmed['words'][words_trimmed['cluster']==2]

## Creating variables across notebooks

In [34]:
%store stop_words_k0
%store stop_words_k1
%store stop_words_k2

Stored 'stop_words_k0' (Series)
Stored 'stop_words_k1' (Series)
Stored 'stop_words_k2' (Series)


### Now I transfer these cutomized stop words to the Sparse data sets

___

### Calculating an average score for each row based on its words scores

In [31]:
new_pred = np.zeros(len(sentences))

In [None]:
# This function iterate through each sentence, and grabs their word score as defined above
# then average over the length of that sentence to calculate a value which is an average score value
# of all the words
for i, row in enumerate(sentences):
    if i%100 == 0:
        print('iteration ', i)
    row_list = []
    for sent in row:
        if words_trimmed['words'].str.contains(sent).any():
            try:
                row_list.append(words_trimmed.loc[sent]['word_score'])
            except KeyError:
                continue
    new_pred[i] = (np.mean(row_list))

In [109]:
# assigning the predictoin into a new column
imdb['new_pred']=pd.Series(new_pred)

In [118]:
imdb['new_pred'] = [1 if i > 0 else 0 for i in imdb.new_pred]

In [119]:
imdb_final = imdb.dropna(axis=0)

In [121]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
f1 = f1_score(imdb_final['sentiment'],imdb_final['new_pred'],pos_label=1)
acc = accuracy_score(imdb_final['sentiment'],imdb_final['new_pred'])

print("Accuracy {:.2f}".format(acc))

Accuracy 0.50
