<a href="https://colab.research.google.com/github/RV05/NLP-ABSA-research-papers-implementation/blob/main/sentiment_analysis_using_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive inside this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers==3.0.2

In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Pick the compute device: use CUDA when PyTorch reports one, otherwise the CPU.

from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Location of the training CSV on the mounted Drive.
train_csv_path = '/content/drive/MyDrive/AWSLambda_ECR_ABSA-main/train.csv'
train = pd.read_csv(train_csv_path)

In [None]:
train.shape

(4000, 3)

In [None]:
train.head()

Unnamed: 0,text,aspect,label
0,can you check whether its cancelled completely?,cancelled,1
1,cannot rely on both milk delivery and grocery ...,Milk,0
2,"I get no notification, however the app is real...",notification,0
3,"Love this app, but would love it even more if ...",view,1
4,it does not let me load a clip on the scene,load,0


In [None]:
# Use the column names the rest of the notebook expects: text -> Phrase, label -> Sentiment.
column_renames = {'text': 'Phrase', 'label': 'Sentiment'}
train = train.rename(columns=column_renames)

In [None]:
# Drop every row labelled 2 so that only the 0/1 sentiments remain.
# The label is listed both as the integer 2 and as the string '2': whether
# isin(['2']) matches an integer column may depend on how the installed pandas
# coerces the values, so naming both spellings keeps the filter effective
# either way.
# .copy() yields an independent frame, so the column assignments in later
# cells do not write into a slice of the original.
train = train[~train['Sentiment'].isin([2, '2'])].copy()

In [None]:
train['Sentiment'].unique()

array([1, 0])

In [None]:
import re

In [None]:
contractions_dict = {"ain't": "are not","'s":" is","aren't": "are not"}


def _compile_contractions(mapping):
    """Build one alternation regex that matches any key of ``mapping``."""
    # Keys are escaped so that regex metacharacters inside a contraction are
    # matched literally. Longer keys are tried first so a long contraction is
    # preferred over a shorter key that happens to be its prefix.
    ordered_keys = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))


# Regular expression for finding contractions
contractions_re = _compile_contractions(contractions_dict)
# Keep a handle on the default mapping so its pre-compiled pattern can be reused.
_default_contractions = contractions_dict


def expand_contractions(text,contractions_dict=contractions_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Args:
        text: the string to process.
        contractions_dict: mapping of contraction -> replacement. When a
            mapping other than the module default is supplied, a pattern is
            compiled from that mapping's own keys, so the keys that are
            searched for always agree with the keys that are looked up.

    Returns:
        ``text`` with each matched key substituted by its mapped value. An
        empty mapping returns ``text`` unchanged.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string at every position.
        return text
    if contractions_dict is _default_contractions:
        pattern = contractions_re
    else:
        pattern = _compile_contractions(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]
    return pattern.sub(replace, text)
# Expand contractions in every review phrase.
expanded_phrases = train['Phrase'].apply(expand_contractions)
train['Phrase'] = expanded_phrases

In [None]:
import string

In [None]:
# Delete every character listed in string.punctuation from each phrase.
punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
train['Phrase'] = train['Phrase'].apply(lambda phrase: punctuation_re.sub('', phrase))


In [None]:
# Remove every token that contains a digit (e.g. "4g", "2nd", "100").
# The character classes need their backslashes: written as 'W*dw*' the pattern
# matched the literal letter "d" (plus any adjacent "W"/"w") and deleted it
# from ordinary words.
train['Phrase'] = train['Phrase'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))


In [None]:
import nltk
# Download the NLTK 'stopwords' package.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stop_words`` removed.

    The membership test is exact (case-sensitive); surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in str(text).split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


train['Phrase'] = train['Phrase'].apply(remove_stopwords)

In [None]:
# Download the NLTK 'wordnet' package.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with single spaces."""
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


train["Phrase"] = train["Phrase"].apply(lemmatize_words)

In [None]:
train.describe()

Unnamed: 0,Sentiment
count,2974.0
mean,0.435104
std,0.495854
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
# Keep only the model input (Phrase) and the label (Sentiment).
model_columns = ['Phrase', 'Sentiment']
new_df = train.loc[:, model_columns]

In [None]:
new_df.head()

Unnamed: 0,Phrase,Sentiment
0,check whether cancelle completely,1
1,cannot rely milk elivery grocery item,0
2,I get notification however app really fine,0
3,Love app woul love even Gantt chart Calenar vi...,1
4,oes let loa clip scene,0


In [None]:
# Defining some key variables that will be used later on in the training
# MAX_LEN: max_length handed to the tokenizer for every phrase.
MAX_LEN = 256
# Batch sizes for the training and the test DataLoader respectively.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS = 1
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-05
# NOTE(review): RoBERTa's vocabulary is presumably case-sensitive -- confirm that
# do_lower_case (and truncation passed here rather than at encode time) has any effect.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
class SentimentData(Dataset):
    """Torch dataset that tokenizes one phrase per item and pairs it with its label.

    Args:
        dataframe: frame exposing a ``Phrase`` (text) and a ``Sentiment``
            (label) column.
        tokenizer: object providing ``encode_plus``; its result must contain
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: ``max_length`` passed to the tokenizer; sequences are padded
            to and truncated at this length.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` as long tensors
            and ``targets`` as a float tensor holding the row's label.
        """
        # Positional lookup: works whether or not the frame's index was reset
        # to 0..n-1 (label-based lookup raised KeyError otherwise).
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit strategies instead of the deprecated pad_to_max_length
            # flag and the implicit truncation fallback.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Seeded 80/20 split: sample the training rows, use the remaining rows for testing,
# and renumber both frames from 0.
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(index=train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

# Wrap each split so items are tokenized on access.
training_set = SentimentData(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = SentimentData(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

FULL Dataset: (2974, 2)
TRAIN Dataset: (2379, 2)
TEST Dataset: (595, 2)


In [None]:
# DataLoader keyword arguments for each split; both loaders shuffle and load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder followed by a two-layer classification head.

    Args:
        num_classes: width of the output layer. Defaults to 5, the value that
            was previously hard-coded, so existing ``RobertaClass()`` callers
            and saved weights keep the same shapes. The labels used in this
            notebook are binary, so ``num_classes=2`` is sufficient there.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, num_classes=5, dropout=0.3):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # Size the head from the encoder's config instead of repeating 768.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier output for a batch of encoded phrases.

        The three arguments are forwarded unchanged to the encoder. The head
        reads position 0 along dim 1 of the encoder's first output
        (presumably the ``<s>`` token's last hidden state -- confirm).
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
# Build the classifier with its default arguments and move it to the selected device.
model = RobertaClass()
model.to(device)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [None]:
# Cross-entropy over the classifier outputs, optimised with Adam over all model parameters.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(preds, targets):
    """Return the number of positions at which ``preds`` equals ``targets``.

    Despite the name this is a count of correct predictions, not a ratio.
    """
    hits = preds == targets
    return hits.sum().item()

In [None]:
# Defining the training function on the 80% of the dataset for fine-tuning the RoBERTa model
# NOTE: this definition rebinds the notebook-level name `train`, which earlier
# cells use for the DataFrame; re-running those cells afterwards needs the CSV
# to be loaded again first.

def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Uses the notebook-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``. Prints the running loss/accuracy on every
    5000th step (including step 0) and the epoch totals at the end.

    Args:
        epoch: epoch number, used only in the printed summary.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself (not the enumerate object) so tqdm can read its
    # length and display a total.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # The dataset yields float labels; CrossEntropyLoss needs class indices.
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest output per row.
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if step%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        # Clear gradients left from the previous step, then backpropagate and update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Number of full passes over the training loader.
EPOCHS = 10
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.16546104848384857
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.44it/s]


The Total Accuracy for Epoch 0: 85.24590163934427
Training Loss Epoch: 0.36406288766020894
Training Accuracy Epoch: 85.24590163934427


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.12256880849599838
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.44it/s]


The Total Accuracy for Epoch 1: 87.93610760823876
Training Loss Epoch: 0.295029576780312
Training Accuracy Epoch: 87.93610760823876


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.17654135823249817
Training Accuracy per 5000 steps: 100.0


298it [03:26,  1.44it/s]


The Total Accuracy for Epoch 2: 91.13072719630097
Training Loss Epoch: 0.23931191321587403
Training Accuracy Epoch: 91.13072719630097


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.36869487166404724
Training Accuracy per 5000 steps: 87.5


298it [03:27,  1.44it/s]


The Total Accuracy for Epoch 3: 93.10634720470786
Training Loss Epoch: 0.19190106463099785
Training Accuracy Epoch: 93.10634720470786


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.05012406408786774
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.44it/s]


The Total Accuracy for Epoch 4: 94.45145018915511
Training Loss Epoch: 0.16412808103236876
Training Accuracy Epoch: 94.45145018915511


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.08536649495363235
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.43it/s]


The Total Accuracy for Epoch 5: 95.83858764186633
Training Loss Epoch: 0.11832668563201024
Training Accuracy Epoch: 95.83858764186633


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.019901679828763008
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.43it/s]


The Total Accuracy for Epoch 6: 96.34300126103405
Training Loss Epoch: 0.1129277118062913
Training Accuracy Epoch: 96.34300126103405


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.020480794832110405
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.44it/s]


The Total Accuracy for Epoch 7: 96.04875998318622
Training Loss Epoch: 0.10512372211868806
Training Accuracy Epoch: 96.04875998318622


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.025849690660834312
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.43it/s]


The Total Accuracy for Epoch 8: 96.63724253888188
Training Loss Epoch: 0.08801272626061481
Training Accuracy Epoch: 96.63724253888188


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.017128409817814827
Training Accuracy per 5000 steps: 100.0


298it [03:27,  1.44it/s]

The Total Accuracy for Epoch 9: 97.26775956284153
Training Loss Epoch: 0.0665351572554346
Training Accuracy Epoch: 97.26775956284153





In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without gradient tracking.

    Uses the notebook-level ``loss_function``, ``calcuate_accuracy`` and
    ``device``. Prints the running loss/accuracy on every 5000th step
    (including step 0), then the correct / wrong / total example counts and
    the overall loss and accuracy.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: iterable of batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` entries.

    Returns:
        Accuracy over all evaluated examples, as a percentage.
    """
    model.eval()
    n_correct = 0
    n_wrong = 0
    total = 0
    tr_loss = 0
    nb_tr_steps = 0
    with torch.no_grad():
        # Wrap the loader itself so tqdm can read its length and show a total.
        for step, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            # No squeeze(): on a batch of one it would drop the batch
            # dimension and break both the loss and the dim=1 reduction below.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs, dim=1)

            batch_size = targets.size(0)
            batch_correct = calcuate_accuracy(big_idx, targets)
            n_correct += batch_correct
            n_wrong += batch_size - batch_correct
            total += batch_size
            nb_tr_steps += 1

            if step%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/total
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")
    print(n_correct)
    print(n_wrong)
    print(total)
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/total

    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [None]:
# Evaluate on the test loader; valid() returns the accuracy as a percentage.
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

2it [00:00,  6.04it/s]

Validation Loss per 100 steps: 0.006345881149172783
Validation Accuracy per 100 steps: 100.0


149it [00:18,  8.01it/s]

471
0
0
Validation Loss Epoch: 1.023852475605133
Validation Accuracy Epoch: 79.15966386554622
Accuracy on test data = 79.16%





In [None]:
# Persist the model and the tokenizer vocabulary into the current working directory.
output_model_file = 'pytorch_roberta_sentiment.bin'
output_vocab_file = './'

# NOTE(review): this saves the whole module object rather than a state_dict;
# loading it back presumably requires the RobertaClass definition to be importable -- confirm.
model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed
