In [1]:
import numpy as np
import regex as re
import pandas as pd
import gc
import logging
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import string
import nltk as nlp
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
#from gensim.test.utils import common_texts
#from collections import Counter #like map but worse cuz it senses only the tally --> not for computation :(
from tensorflow.python.client import device_lib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4711559196811733207
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2907098318
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7788326661296675216
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [3]:
print(tf.test.is_built_with_cuda())

True


#### LOADING IMDB DATASET

In [4]:
dataframe = pd.read_csv(r'..\\IMDB Dataset.csv')

In [5]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
dataframe.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [7]:
np.shape(dataframe)

(50000, 2)

In [8]:
dataframe['sentiment'] = dataframe['sentiment'].replace('positive', 1)
dataframe['sentiment'] = dataframe['sentiment'].replace('negative', 0)
dataframe.head()
#in case of non-binary classes it makes more sense to use label encoder rather than replace

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### CHECK FOR NULLS AND DUPLICATES

In [9]:
dataframe.isnull().sum()

review       0
sentiment    0
dtype: int64

In [10]:
dataframe.duplicated().sum()

418

In [11]:
dataframe.drop_duplicates(subset='review', keep='first', inplace=True)

In [12]:
np.shape(dataframe)

(49582, 2)

In [13]:
dataframe.duplicated().sum()

0

In [14]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### split into test and train samples

#### REMOVING NON-WORD CHARACTERS FROM THE DATASET

In [15]:
def preprocess(text, pattern):
    if pattern=='[.]+':
        text = re.sub(pattern, '. ', text)
    elif pattern =="[']":
        text =  re.sub(pattern, ' ', text)
    else:
        text = re.sub(pattern, '', text)
    #print(text, '\n')
    return text

In [16]:
dataframe['review'] = np.vectorize(preprocess)(dataframe['review'], '<[^>]*>') #remove markup
#print(dataframe.loc[0, 'review'])
#dataframe.head()

In [17]:
dataframe['review'] = np.vectorize(preprocess)(dataframe['review'], '[.]+') #remove ... and replace with .
#print(dataframe.loc[0, 'review'])
#dataframe.head()

In [18]:
dataframe['review'] = np.vectorize(preprocess)(dataframe['review'], '[0-9]+') #remove numbers and replace with none
#print(dataframe.loc[0, 'review'])
#dataframe.head()

In [19]:
dataframe['review'] = np.vectorize(preprocess)(dataframe['review'], "[']") #remove ' and replace with 
#print(dataframe.loc[0, 'review'])
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming te...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there s a family where a little boy ...,0
4,"Petter Mattei s ""Love in the Time of Money"" is...",1


In [20]:
gc.collect()

0

In [21]:
dataframe['review'] = np.vectorize(preprocess)(dataframe['review'], '[^\w\s]*') #remove everything that's not word space or '
# ' is left to handle contractions
#print(dataframe.loc[0, 'review'])
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there s a family where a little boy ...,0
4,Petter Mattei s Love in the Time of Money is a...,1


In [22]:
dataframe['review'] = dataframe['review'].str.lower()#make it lower
#print(dataframe.loc[0, 'review'])
dataframe.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is a...,1


In [23]:
print(dataframe.loc[0, 'review'])

one of the other reviewers has mentioned that after watching just  oz episode you ll be hooked  they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go  trust me this is not a show for the faint hearted or timid  this show pulls no punches with regards to drugs sex or violence  its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary  it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda  em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wouldn t dare  forget p

#### DOES IT MAKE SENSE TO REMOVE SOME WORDS TO REDUCE COMPUTATION?

In [24]:
count = CountVectorizer()
bag = count.fit_transform(dataframe['review'])
len(count.vocabulary_)

148992

#### Got over 2 Lakh words --> it makes sense to remove some words like articles and prepositions out
#### Better to remove stop words first (Why? --> documentation wip)

In [25]:
#something with tfidf
#question: does it make sense to do tfidf first and then remove stop words using the nltk corpus or 
#remove stop words using the corpus first then perform tfidf next

In [26]:
nlp.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rps24\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
stop = set(stopwords.words('english')) #set makes serach O(1)
#originally stopwords.words('english') yields a list
#print(stop)

In [28]:
ps  = PorterStemmer()

In [29]:
def remove_stopwords_and_stem(text):
    text_ = word_tokenize(text)
    tokens = []
    #print(text_)
    for word in text_:
        if word not in stop:
            tokens.append(ps.stem(word))
    #return lemmatization(tokens)
    #print(tokens)
    text = ' '.join(tokens) #send only tokens sent as a joined sentence
    return text 

In [30]:
dataframe['review'] = np.vectorize(remove_stopwords_and_stem)(dataframe['review'])
print(dataframe.loc[0, 'review'])

one review mention watch oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side


In [31]:
lemmatizer = WordNetLemmatizer()
nlp.download('wordnet')
reviews = []

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rps24\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
def lemmatization(text):
    text_ = word_tokenize(text)
    tokens = []
    for word in text_:
        tokens.append(lemmatizer.lemmatize(word))
    text = ' '.join(tokens)
    reviews.append(tokens)
    return text

In [33]:
dataframe['review'] = np.vectorize(lemmatization)(dataframe['review'])
print(dataframe.loc[0, 'review'])

one review mention watch oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side


#### word2vec

In [34]:
all_reviews = np.array(reviews)
del reviews
del dataframe

  all_reviews = np.array(reviews)


In [35]:
gc.collect()

0

In [36]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [37]:
word2vec_model = Word2Vec(all_reviews, window = 3, min_count = 1, sg = 2, size = 256, workers = 5) #sg --> skipgram
#workers --> number of threads in useb

2021-10-04 18:14:26,677 : INFO : collecting all words and their counts
2021-10-04 18:14:26,677 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-10-04 18:14:26,946 : INFO : PROGRESS: at sentence #10000, processed 1165655 words, keeping 46765 word types
2021-10-04 18:14:27,194 : INFO : PROGRESS: at sentence #20000, processed 2332830 words, keeping 68413 word types
2021-10-04 18:14:27,457 : INFO : PROGRESS: at sentence #30000, processed 3497506 words, keeping 86270 word types
2021-10-04 18:14:27,733 : INFO : PROGRESS: at sentence #40000, processed 4662105 words, keeping 102404 word types
2021-10-04 18:14:28,001 : INFO : collected 116586 word types from a corpus of 5783805 raw words and 49583 sentences
2021-10-04 18:14:28,005 : INFO : Loading a fresh vocabulary
2021-10-04 18:14:28,189 : INFO : effective_min_count=1 retains 116586 unique words (100% of original 116586, drops 0)
2021-10-04 18:14:28,189 : INFO : effective_min_count=1 leaves 5783805 word corpus (

2021-10-04 18:15:36,688 : INFO : EPOCH 4 - PROGRESS: at 20.41% examples, 368137 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:37,693 : INFO : EPOCH 4 - PROGRESS: at 28.06% examples, 380781 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:38,709 : INFO : EPOCH 4 - PROGRESS: at 35.87% examples, 387748 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:39,718 : INFO : EPOCH 4 - PROGRESS: at 44.43% examples, 400601 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:40,732 : INFO : EPOCH 4 - PROGRESS: at 52.99% examples, 409315 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:41,757 : INFO : EPOCH 4 - PROGRESS: at 61.68% examples, 416274 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:42,782 : INFO : EPOCH 4 - PROGRESS: at 70.61% examples, 422931 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:43,783 : INFO : EPOCH 4 - PROGRESS: at 79.92% examples, 431033 words/s, in_qsize 9, out_qsize 0
2021-10-04 18:15:44,786 : INFO : EPOCH 4 - PROGRESS: at 88.74% examples, 435934 words/s, in_qsiz

In [38]:
#need to save model here
word2vec_model.wv.save_word2vec_format('../word_embeddings.txt')

2021-10-04 18:15:57,044 : INFO : storing 116586x256 projection weights into ../word_embeddings.txt


In [39]:
word2vec_model = KeyedVectors.load_word2vec_format('../word_embeddings.txt', binary = False, unicode_errors = 'ignore')

2021-10-04 18:16:27,396 : INFO : loading projection weights from ../word_embeddings.txt
2021-10-04 18:16:43,088 : INFO : loaded (116586, 256) matrix from ../word_embeddings.txt


In [40]:
word2vec_model.wv.similarity('saw', 'may')

  word2vec_model.wv.similarity('saw', 'may')


0.44250163

In [41]:
word2vec_model.wv.similarity('saw', 'say')

  word2vec_model.wv.similarity('saw', 'say')


0.43232653

In [42]:
word2vec_model.wv.similarity('say', 'may')

  word2vec_model.wv.similarity('say', 'may')


0.45469904

In [43]:
word2vec_model.wv.similarity('gangsta', 'latino')

  word2vec_model.wv.similarity('gangsta', 'latino')


0.6966977

In [44]:
len(word2vec_model.wv.vocab)

  len(word2vec_model.wv.vocab)


116586

In [45]:
gc.collect()

0

#### load dictionary of word to vectors --> from gensim instance to dictionary

In [46]:
embedding = {}
with open('../word_embeddings.txt', encoding = 'utf-8') as f:
    #page = f.read()
    for line in f:
        record = line.split()
        #print(record[0])
        #word = record[0]
        embedding[record[0]] = np.asarray(record[1:])        

In [47]:
print(len(embedding))

116587


In [48]:
gc.collect()

0

#### simple rnn

#### metrics