In [22]:
import numpy as np
import pandas as pd
import io
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
import os, re, csv, math
from sklearn import model_selection
from sklearn import metrics
import torch
import torch.nn as nn
import tensorflow as tf  # pytorch for the model, tensorflow for tokenizer
import gensim
from gensim.models import KeyedVectors
from dataset import IMDBDataset

In [23]:
print(torch.__version__)
torch.cuda.get_device_name(0)

1.13.1+cu117


'NVIDIA GeForce RTX 3050 Ti Laptop GPU'

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [25]:
# Read data
df = pd.read_csv('IMDB Dataset.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [26]:
# Convert sentiment columns to numerical values
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x=='positive' else 0) # only positive or negative
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [27]:
## Cross validation 
# create new column "kfold" and assign a random value
df['kfold'] = -1  # all rows have df['kfold'] = -1

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)
df.head(5)

Unnamed: 0,review,sentiment,kfold
0,You can tell that this is the first offering b...,1,-1
1,A Cryptozoologist captures a mythical chupacab...,0,-1
2,Forgettable pilot that never really explains w...,0,-1
3,Quite simply this shouldn't have been made. It...,0,-1
4,Yes this a B- grade horror. But at least the p...,1,-1


In [28]:
# get label
y = df['sentiment'].values
y

array([1, 0, 0, ..., 1, 1, 1], dtype=int64)

In [29]:
# K-FOLD CROSS VALIDATION SETUP
kf = model_selection.StratifiedKFold(n_splits=5)

# assign folds to [0, 1, 2, 3, 4]
for fold, (train_, valid_) in enumerate(kf.split(X=df, y=y)):
    df.loc[valid_, 'kfold'] = fold

print("Total number of rows: ", df.shape[0])
print("Number of rows with kfold = 0", df[df.kfold==0].shape[0])
print("Number of rows with kfold = 1", df[df.kfold==1].shape[0])
print("Number of rows with kfold = 2", df[df.kfold==2].shape[0])
print("Number of rows with kfold = 3", df[df.kfold==3].shape[0])
print("Number of rows with kfold = 4", df[df.kfold==4].shape[0])

Total number of rows:  50000
Number of rows with kfold = 0 10000
Number of rows with kfold = 1 10000
Number of rows with kfold = 2 10000
Number of rows with kfold = 3 10000
Number of rows with kfold = 4 10000


In [30]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [31]:
#load fasttext embeddings
print('loading word embeddings...')
word_vectors =  gensim.downloader.load('fasttext-wiki-news-subwords-300')

loading word embeddings...


In [32]:
print(len(word_vectors))
word_vectors["computer"].shape  # 300 dimensions

999999


(300,)

In [33]:
class LSTM(nn.Module):
    def __init__(self, embedding_matrix):
        super(LSTM, self).__init__()
        # Number of words = number of rows in embedding matrix
        num_words = embedding_matrix.shape[0]
        # Dimension of embedding is num of columns in the matrix
        embedding_dim = embedding_matrix.shape[1]
        # Define an input embedding layer
        self.embedding = nn.Embedding(num_embeddings=num_words,
                                      embedding_dim=embedding_dim)
        # Embedding matrix actually is collection of parameter
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype = torch.float32))
        # Because we use pretrained embedding (GLove, Fastext,etc) so we turn off requires_grad-meaning we do not train gradient on embedding weight
        self.embedding.weight.requires_grad = False
        # LSTM with hidden_size = 128
        self.lstm = nn.LSTM(
                            embedding_dim, 
                            128,
                            bidirectional=True,
                            batch_first=True,
                             )
        # Input(512) because we use bi-directional LSTM ==> hidden_size*2 + maxpooling **2  = 128*4 = 512, will be explained more on forward method
        self.out = nn.Linear(512, 1)
    def forward(self, x):
        # pass input (tokens) through embedding layer
        x = self.embedding(x)
        # fit embedding to LSTM
        hidden, _ = self.lstm(x)
        # apply mean and max pooling on lstm output
        avg_pool= torch.mean(hidden, 1)
        max_pool, index_max_pool = torch.max(hidden, 1)
        # concat avg_pool and max_pool (so we have 256 size, also because this is bidirectional ==> 256*2 = 512)
        out = torch.cat((avg_pool, max_pool), 1)
        # fit out to self.out to conduct dimensionality reduction from 512 to 1
        out = self.out(out)
        # return output
        return out

In [42]:
def train(data_loader, model, optimizer, device):
    """
    this is model training for one epoch
    data_loader:  this is torch dataloader, just like dataset but in torch and devide into batches
    model : lstm
    optimizer : torch optimizer : adam
    device:  cuda or cpu
    """
    # set model to training mode
    model.train()
    # go through batches of data in data loader
    for data in data_loader:
        reviews = data['review']
        targets = data['target']
        # move the data to device that we want to use
        reviews = reviews.to(device, dtype = torch.long)
        targets = targets.to(device, dtype = torch.float)
        # clear the gradient
        optimizer.zero_grad()
        # make prediction from model
        predictions = model(reviews)
        # caculate the losses
        loss = nn.BCEWithLogitsLoss()(predictions, targets.view(-1,1))
        # backprob
        loss.backward()
        #single optimization step
        optimizer.step()

In [43]:
def evaluate(data_loader, model, device):
    final_predictions = []
    final_targets = []
    model.eval()
    # turn off gradient calculation
    with torch.no_grad():
        for data in data_loader:
            reviews = data['review']
            targets = data['target']
            reviews = reviews.to(device, dtype = torch.long)
            targets = targets.to(device, dtype=torch.float)
            # make prediction
            predictions = model(reviews)
            # move prediction and target to cpu
            predictions = predictions.cpu().numpy().tolist()
            targets = data['target'].cpu().numpy().tolist()
            # add predictions to final_prediction
            final_predictions.extend(predictions)
            final_targets.extend(targets)
    return final_predictions, final_targets

In [44]:
MAX_LEN = 128 # maximum length for a sentence
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 5

In [45]:
def create_embedding_matrix(word_index, embedding_dict=None, dim=300):
    """
     this function create the embedding matrix save in numpy array
    :param word_index: a dictionary with word: index_value
    :param embedding_dict: a dict with word embedding
    :d_model: the dimension of word pretrained embedding
    :return a numpy array with embedding vectors for all known words
    """
    embedding_matrix = np.zeros((len(word_index) + 1, dim))
    ## loop over all the words
    for word, index in word_index.items():
        if word in embedding_dict:
            embedding_matrix[index] = embedding_dict[word]
    return embedding_matrix

# embedding_dict['word'] = vector
# word_index['word'] = index
# embedding_matrix[index] = vector

In [46]:
df['review'].values.tolist()[:5] # list of sentences

['You can tell that this is the first offering by the Director (who also wrote it), but you can also see the potential this guy has. This is an obviously low budget film in the spirit of Boondock Saints. Of course, Boondock Saints came out a few years after this, so you could look at this as a diamond in need of some polish. The acting was good - if you\'re looking for DeNiro or Michael Madsen in a crime drama, remember that these are young guys, playing young guys trying to be criminals. They\'re not going to be "supercool" (tm) like some of the veterans. I would have love to have seen Justin Pagel (Joe - the main character) go on to make more movies - he was great in this. Good movie - 3 stars out of 5.',
 'A Cryptozoologist captures a mythical chupacabra on a Caribbean island.To get it back to civilization he bribes his way onto the cargo bay of a large luxury cruise ship with funny and I think the script intended disastrous results.<br /><br />Lets start with the one thing I really

In [47]:
# STEP 1: Tokenization
# use tf.keras for tokenization,  
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['review'].values.tolist())

In [48]:
# 0 is reserved for padding
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [49]:
print('Load fasttext embedding')
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict=word_vectors, dim=300)


for fold in range(5):
    # STEP 2: cross validation
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    
    # STEP 3: pad sequence
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)
    
    # zero padding
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=MAX_LEN)
    
    # STEP 4: initialize dataset class for training
    train_dataset = IMDBDataset(reviews=xtrain, targets=train_df['sentiment'].values)
    
    # STEP 5: Load dataset to Pytorch DataLoader
    # after we have train_dataset, we create a torch dataloader to load train_dataset class based on specified batch_size
    train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, num_workers=2)
    # initialize dataset class for validation
    valid_dataset = IMDBDataset(reviews=xtest, targets=valid_df['sentiment'].values)
    valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size = VALID_BATCH_SIZE, num_workers=1)
    
    # STEP 6: Running 
    device = torch.device('cuda')
    # feed embedding matrix to lstm
    model_fasttext = LSTM(embedding_matrix)
    # set model to cuda device
    model_fasttext.to(device)
    # initialize Adam optimizer
    optimizer = torch.optim.Adam(model_fasttext.parameters(), lr=1e-3)
    
    print('training model')

    for epoch in tqdm(range(EPOCHS)):
        # train one epoch
        train(train_data_loader, model_fasttext, optimizer, device)
        # validate
        outputs, targets = evaluate(valid_data_loader, model_fasttext, device)
        # threshold
        outputs = np.array(outputs) >= 0.5
        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f'FOLD:{fold}, epoch: {epoch}, accuracy_score: {accuracy}')

Load fasttext embedding
training model


 20%|██        | 1/5 [00:17<01:10, 17.74s/it]

FOLD:0, epoch: 0, accuracy_score: 0.8613


 40%|████      | 2/5 [00:35<00:53, 17.86s/it]

FOLD:0, epoch: 1, accuracy_score: 0.8788


 60%|██████    | 3/5 [00:52<00:35, 17.58s/it]

FOLD:0, epoch: 2, accuracy_score: 0.8814


 80%|████████  | 4/5 [01:09<00:17, 17.36s/it]

FOLD:0, epoch: 3, accuracy_score: 0.8831


100%|██████████| 5/5 [01:27<00:00, 17.44s/it]

FOLD:0, epoch: 4, accuracy_score: 0.8808





training model


 20%|██        | 1/5 [00:17<01:08, 17.08s/it]

FOLD:1, epoch: 0, accuracy_score: 0.8522


 40%|████      | 2/5 [00:34<00:52, 17.53s/it]

FOLD:1, epoch: 1, accuracy_score: 0.8688


 60%|██████    | 3/5 [00:52<00:35, 17.66s/it]

FOLD:1, epoch: 2, accuracy_score: 0.8757


 80%|████████  | 4/5 [01:09<00:17, 17.43s/it]

FOLD:1, epoch: 3, accuracy_score: 0.8778


100%|██████████| 5/5 [01:27<00:00, 17.42s/it]

FOLD:1, epoch: 4, accuracy_score: 0.8808





training model


 20%|██        | 1/5 [00:17<01:08, 17.11s/it]

FOLD:2, epoch: 0, accuracy_score: 0.8622


 40%|████      | 2/5 [00:34<00:51, 17.13s/it]

FOLD:2, epoch: 1, accuracy_score: 0.879


 60%|██████    | 3/5 [00:51<00:34, 17.21s/it]

FOLD:2, epoch: 2, accuracy_score: 0.8903


 80%|████████  | 4/5 [01:08<00:17, 17.24s/it]

FOLD:2, epoch: 3, accuracy_score: 0.8923


100%|██████████| 5/5 [01:26<00:00, 17.27s/it]

FOLD:2, epoch: 4, accuracy_score: 0.8941





training model


 20%|██        | 1/5 [00:17<01:09, 17.34s/it]

FOLD:3, epoch: 0, accuracy_score: 0.847


 40%|████      | 2/5 [00:34<00:51, 17.29s/it]

FOLD:3, epoch: 1, accuracy_score: 0.8604


 60%|██████    | 3/5 [00:51<00:34, 17.33s/it]

FOLD:3, epoch: 2, accuracy_score: 0.8698


 80%|████████  | 4/5 [01:09<00:17, 17.21s/it]

FOLD:3, epoch: 3, accuracy_score: 0.8771


100%|██████████| 5/5 [01:26<00:00, 17.23s/it]

FOLD:3, epoch: 4, accuracy_score: 0.8784





training model


 20%|██        | 1/5 [00:17<01:10, 17.71s/it]

FOLD:4, epoch: 0, accuracy_score: 0.6997


 40%|████      | 2/5 [00:34<00:52, 17.40s/it]

FOLD:4, epoch: 1, accuracy_score: 0.7478


 60%|██████    | 3/5 [00:51<00:34, 17.23s/it]

FOLD:4, epoch: 2, accuracy_score: 0.7773


 80%|████████  | 4/5 [01:10<00:17, 17.59s/it]

FOLD:4, epoch: 3, accuracy_score: 0.7803


100%|██████████| 5/5 [01:28<00:00, 17.63s/it]

FOLD:4, epoch: 4, accuracy_score: 0.7807



