In [130]:
import torch
import torch.nn as nn
import pandas as pd
from bs4 import BeautifulSoup
import re
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import softmax
import math

In [100]:
data = pd.read_csv("IMDB_Dataset.csv")


In [101]:
def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Apply the cleaning function to the review column
data['cleaned_review'] = data['review'].apply(clean_text)


  text = BeautifulSoup(text, "html.parser").get_text()


In [102]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert sentiment labels to numerical format
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [103]:
class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.cleaned_review
        self.targets = self.data.sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

In [104]:
max_len = 512  # You can adjust this depending on your specific needs

# Create an instance of the dataset
dataset = SentimentDataset(data, tokenizer, max_len)

# Example: Creating a DataLoader for training
train_dataloader = DataLoader(dataset, batch_size=16, shuffle=True)  # Adjust batch size as needed


In [105]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, 2)  # Assuming binary classification

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [106]:
ntokens = len(tokenizer.vocab)  # size of vocabulary
emsize = 200  # embedding dimension
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)



In [107]:
iter_loader = iter(train_dataloader)
first_batch = next(iter_loader)

In [108]:
input = first_batch['ids']
mask = first_batch['mask']
target = first_batch['targets']

In [109]:
input.shape

torch.Size([16, 512])

In [110]:
src_mask = model.generate_square_subsequent_mask(input.size(0))

In [111]:
src_mask.shape

torch.Size([16, 16])

In [112]:
out = model(input, src_mask)
out = out[:,0,:]
out.shape

torch.Size([16, 2])

In [113]:
target.shape

torch.Size([16])

In [114]:
loss = criterion(out , target)

In [116]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

for epoch in range(10):  # Num of epochs
    model.train()
    total_loss = 0.
    for i, batch in enumerate(train_dataloader):
        inputs, targets = batch['ids'], batch['targets']
        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(inputs.size(0))
        output = model(inputs, src_mask)
        output = output[:,0,:]
        loss = criterion(output, targets.long())
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        # log_interval = 200
        # if i % log_interval == 0 and i > 0:
        # cur_loss = total_loss / log_interval
        print('| epoch {:3d} | {:5d}/{:5d} iteration | '
                  'loss {} |'.format(epoch+1, i, len(train_dataloader), loss_value))


| epoch   1 |     0/ 3125 iteration | loss 0.6807945966720581 |
| epoch   1 |     1/ 3125 iteration | loss 0.8245986700057983 |
| epoch   1 |     2/ 3125 iteration | loss 0.9267174005508423 |
| epoch   1 |     3/ 3125 iteration | loss 0.7987486124038696 |
| epoch   1 |     4/ 3125 iteration | loss 0.8654099106788635 |
| epoch   1 |     5/ 3125 iteration | loss 0.9166464805603027 |
| epoch   1 |     6/ 3125 iteration | loss 0.6894135475158691 |
| epoch   1 |     7/ 3125 iteration | loss 0.7799966931343079 |
| epoch   1 |     8/ 3125 iteration | loss 0.7373149394989014 |
| epoch   1 |     9/ 3125 iteration | loss 0.7779186367988586 |
| epoch   1 |    10/ 3125 iteration | loss 0.7196653485298157 |
| epoch   1 |    11/ 3125 iteration | loss 0.6378631591796875 |
| epoch   1 |    12/ 3125 iteration | loss 0.7123527526855469 |
| epoch   1 |    13/ 3125 iteration | loss 0.663116991519928 |
| epoch   1 |    14/ 3125 iteration | loss 0.8183684945106506 |
| epoch   1 |    15/ 3125 iteration | los

KeyboardInterrupt: 

In [148]:
sentence = "fucking diaster"

# Interpret the predicted class
enc_sen = tokenizer.encode(sentence)
tensor_enc_sen = torch.LongTensor(enc_sen)
tensor_enc_sen_unsqueeze = tensor_enc_sen.unsqueeze(0)
mask_tensor_enc_sen = model.generate_square_subsequent_mask(tensor_enc_sen_unsqueeze.size(0))

In [149]:
out = model(tensor_enc_sen_unsqueeze , mask_tensor_enc_sen)

In [150]:
prediction = out[:,0,:]
prediction

tensor([[0.0819, 0.3412]], grad_fn=<SliceBackward0>)

In [151]:
probabilities = softmax(prediction , dim = 1)
probabilities

tensor([[0.4355, 0.5645]], grad_fn=<SoftmaxBackward0>)

In [152]:
predicted_class = torch.argmax(probabilities, dim=1).item()

In [153]:
predicted_class

1

In [154]:
if predicted_class == 0:
    print("The sentence is classified as negative.")
else:
    print("The sentence is classified as positive.")

The sentence is classified as positive.
