In [4]:
import os
import shutil
import pandas as pd
import pickle
import numpy as np
import torch
from torch import tensor
from collections import Counter
import pickle
import re

### Glove_6B_50d [download from stanford](http://nlp.stanford.edu/data/glove.6B.zip)

In [12]:
glove_vec={}
with open('glove.6B.50d.txt','r') as f:
    for i in f.read().split('\n')[:-1]:
        word=i.split()[0].lower()
        vec=np.asarray(i.split()[1:],dtype='float32')
        glove_vec[word]=vec

In [13]:
glove_vec['the'].shape

(50,)

### IMDB Dataset [download from kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [14]:
df=pd.read_csv('/content/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Preprocessing

In [15]:
#function to convert reviews into lowercase and remove special characters
def cleanText(df,column):
    for i in range(df.shape[0]):
        sents= str(df.loc[i,column]).lower()
        df.loc[i,column]=re.sub('[^a-zA-Z]+',' ',sents)
    return df

In [16]:
df=cleanText(df,'review')

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive


### Creating a list of frequently Unique Words in the reviews.

In [18]:
""" creating vocablary"""

def createVocab(df,column):
    '''Given function pick top 15000 words'''
    words=[]
    for i in df[column]:
        [words.append(w) for w in i.split()]
    word_freq=Counter(words)
    print(len(word_freq))
    vocab=[]
    for k,v in word_freq.items():
        if len(k)>2 and v>=10 and k in glove_vec:
            vocab.append(k)
        if len(vocab)==15000:
            break
    return vocab

vocab=createVocab(df,'review')

99426


In [19]:
print(len(vocab))

15000


In [20]:
""" label encoding words i.e linkinng words in the list to its index"""
word_to_index={}
index_to_word={}
for i,w in enumerate(vocab):
    word_to_index[w]=i+1
    index_to_word[i+1]=w

In [22]:
""" creating embed matrix"""
embed_matrix=torch.zeros(len(vocab)+1,50)
for i,v in index_to_word.items():
    embed_matrix[i]=torch.from_numpy(glove_vec[v])

In [23]:
embed_matrix.shape

torch.Size([15001, 50])

### Converting textual sentences into numeric list based on index of the words in the list

In [24]:
""" coverting reviews into label encoded sentences"""
def sentsEncode(df,column,word_to_index):
    df[column]=df[column].apply(lambda x: [word_to_index[w] for w in x.split() if w in word_to_index])
    return df

In [25]:
df_new=sentsEncode(df,'review',word_to_index)

### Each sentences with length more than 250 is clipped while those with less length are padded with extra zero
#### Each positive review is assigned label 1 and negative review with label 0

In [26]:
total=0
max_len=0
idx=-1
for i in range(df.shape[0]):
    total+=len(df.loc[i,'review'])
    if max_len<len(df.loc[i,'review']):
        max_len=max(max_len,len(df.loc[i,'review']))
        idx=i
print(max_len,idx,total//df.shape[0])

1774 31481 171


In [27]:
maxlen=250

In [28]:
dataset=[]

In [29]:
for i in range(df_new.shape[0]):
    ln= len(df_new.loc[i,'review'])
    label= 1 if df_new.loc[i,'sentiment']=='positive' else 0
    sents=df_new.loc[i,'review'].copy()
    if ln>=maxlen:
        sents=sents[0:maxlen]
    else:
        while(len(sents)!=maxlen):
            sents.append(0)
    dataset.append((sents,label))

In [31]:
df_new['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [32]:
def fixedLength(df,column,maxlen):
    for i in range(df.shape[0]):
        if len(df.loc[i,'review'])>=maxlen:
            df.loc[i,'review']=df.loc[i,'review'][:maxlen]
        else:
            sents=list(np.zeros((maxlen,),dtype='int64'))
            sents=sents[:len(df.loc[i,'review'])]
            df.loc[i,'review']=sents
    return df

### Saving files for further training

In [33]:
with open('index_to_word.pkl','wb') as f:
    pickle.dump(index_to_word,f)
with open('word_to_index.pkl','wb') as f:
    pickle.dump(word_to_index,f)
with open('embed_tensor.pkl','wb') as f:
    pickle.dump(embed_matrix,f)
with open('imdb.pkl','wb') as f:
    pickle.dump(dataset,f)