# Import Libraries

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from tokenizers import Tokenizer, trainers, pre_tokenizers, models
from torch.utils.data import Dataset, DataLoader, random_split
import re
from tqdm import tqdm
from sklearn.metrics import f1_score

In [3]:
a=torch.tensor(1)

In [8]:
int(a)

1

In [99]:
def text_normalize(text):
    text= re.sub('[^A-Za-z0-9]+', ' ', text)
    return text

In [100]:
ds_path= 'Dataset/Laptop_Train_v2.csv'

df = pd.read_csv(ds_path)
df


Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170
2355,848,"How Toshiba handles the repair seems to vary, ...",repair,conflict,24,30
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,positive,130,136


In [102]:
df.Sentence= df.Sentence.apply(lambda x: text_normalize(x).lower())
df['Aspect Term']= df['Aspect Term'].apply(lambda x: text_normalize(x).lower())
df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,i charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,i charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,the tech guy then said the service center does...,service center,negative,27,41
3,1316,the tech guy then said the service center does...,sales team,negative,109,121
4,1316,the tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2353,2272,we also use paralles so we can run virtual mac...,windows server enterprise 2003,neutral,104,134
2354,2272,we also use paralles so we can run virtual mac...,windows server 2008 enterprise,neutral,140,170
2355,848,how toshiba handles the repair seems to vary s...,repair,conflict,24,30
2356,848,how toshiba handles the repair seems to vary s...,repair,positive,130,136


In [107]:
data={}
data['sentence']= df.Sentence
data['aspect_term']= df['Aspect Term']
data['polarity']= df['polarity']

In [170]:
for sentence, aspect_term, polarity in zip(data['sentence'],data['aspect_term'],data['polarity']):
    print(sentence)
    print(polarity)
    print(aspect_term)
    break

i charge it at night and skip taking the cord with me because of the good battery life
neutral
cord


In [227]:
a,b = random_split(data,(2,1) )

In [171]:
corpus =df.Sentence.values

In [172]:
corpus

array(['i charge it at night and skip taking the cord with me because of the good battery life',
       'i charge it at night and skip taking the cord with me because of the good battery life',
       'the tech guy then said the service center does not do 1to1 exchange and i have to direct my concern to the sales team which is the retail shop which i bought my netbook from',
       ...,
       'how toshiba handles the repair seems to vary some folks  indicate that they were charged for even an intial fix others had the  repair done 5 times',
       'how toshiba handles the repair seems to vary some folks  indicate that they were charged for even an intial fix others had the  repair done 5 times',
       'i would like to use a different operating system altogether'],
      dtype=object)

In [209]:
seq_len = max([len(sentence.split(' ')) for sentence in data['sentence']])
seq_len

77

In [None]:
model = models.WordLevel(unk_token='<unk>') 
tokenizer= Tokenizer(model)
tokenizer.pre_tokenizer= pre_tokenizers.Whitespace()
trainer=trainers.WordLevelTrainer(vocab_size=20000, special_tokens = ['<unk>','<pad>','<sos>','<eos>','<sep>'])
tokenizer.train_from_iterator(corpus, trainer)

In [12]:
def pad_and_truncate(sample, seq_len, pad_idx):
    if len(sample) >= seq_len:
        sample = sample[:seq_len]
    
    else:
        sample= sample + [pad_idx]*(seq_len-len(sample))
    
    return sample

In [176]:
def vectorize(data, tokenizer, seq_len):
    input_ids=[]
    labels=[]
    polarities=[]
    for sentence, aspect_term, polarity in zip(data['sentence'],data['aspect_term'],data['polarity']):
        input_id = [tokenizer.token_to_id(word) if tokenizer.token_to_id(word) else 0  for word in sentence.split(' ') ]
        input_id= pad_and_truncate(input_id,seq_len, tokenizer.token_to_id('<pad>'))
        
        key_labels = aspect_term.split(' ')
        label=[]
        for word in sentence.split(' '):
            if word in key_labels:
                if word == key_labels[0]:
                    label.append(1)
                else:
                    label.append(2)
            else:
                label.append(0)
                
        
        label = pad_and_truncate(label, seq_len, -100)
        
        if polarity=='neural':
            key_polarity = 0
        elif polarity=='positive':
            key_polarity =1
        else:
            key_polarity =2
        
        polarity = [key_polarity if word in key_labels else -1 for word in sentence.split(' ')] 
        polarity = pad_and_truncate(polarity,seq_len,-100)
        input_ids.append(input_id)
        labels.append(label)
        polarities.append(polarity)
    
    return {
        'input_ids': torch.tensor(input_ids),
        'labels': torch.tensor(labels),
        'polarities': torch.tensor(polarities)    
            }
        
        
        
        
        
        
        

In [177]:
class ABSADataset(Dataset):
    def __init__(self, data, tokenizer, seq_len):
        temp= vectorize(data, tokenizer, seq_len)
        self.input_ids= temp['input_ids']
        self.labels = temp['labels']
        self.polarities= temp['polarities']
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx], self.polarities[idx]
    
    
    

In [178]:
train_ds = ABSADataset(data,tokenizer, seq_len)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [179]:
class ABSAModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size,n_heads, num_layers, num_classes):
        super(ABSAModel,self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Embedding(seq_len, embedding_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, dim_feedforward=1024, dropout=0.2, nhead=n_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.conv1d = nn.Conv1d(embedding_dim, 256, 3, 1,1)
        self.fc = nn.Sequential(
            nn.Linear(256,128),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(128,64),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(64,num_classes)
        )
        
    def forward(self, input_ids):
        positions= torch.arange(input_ids.size(1),device=input_ids.device).unsqueeze(0)
        embedded = self.embedding(input_ids)
        positional_encoding = self.positional_encoding(positions)
        inputs = embedded+ positional_encoding # NxLxD
        
        inputs = self.encoder(inputs)
        
        inputs = self.conv1d(inputs.permute(0,2,1)) # NxDxL
        inputs = inputs.permute(0,2,1)
        logits = self.fc(inputs) # NxLxD
        
        return logits.permute(0,2,1) #NxDxL 

In [180]:
id2label={
    0: '0',
    1: 'B-term',
    2: 'I-term'
}
label2id= {
    '0': 0,
    'B-term': 1,
    'I-term': 2
}


In [181]:
def compute_metric(preds,labels):
    true_predictions= [
        [id2label[p] for (p,l) in zip(pred,label) if l !=-100]
        for pred,label in zip(preds,labels)
    ]
    
    true_labels= [
        [id2label[l] for (p,l) in zip(pred,label) if l !=-100]
        for pred,label in zip(preds,labels)
    ]
    true_predictions_flat = [p for sublist in true_predictions for p in sublist]
    true_labels_flat = [l for sublist in true_labels for l in sublist]
    results = f1_score(true_labels_flat, true_predictions_flat, average="macro")  
    return results

In [182]:
def evaluate(model, dataloader, device):
    
    with torch.no_grad():
        model.eval()
        for idx, (input_ids, labels, polarities) in enumerate(dataloader):
            input_ids= input_ids.to(device)
            labels= labels.to(device)
            polarities=polarities.to(device)
            
            preds = model(input_ids) # NxDxL
            preds = preds.argmax(1) # NxL
            
            preds = preds.cpu().numpy().tolist()
            labels = labels.cpu().numpy().tolist()
            
            score= compute_metric(preds, labels)
            
            print(f'Score: {score:.4f} ')
        
            
                
                
                
                
                

In [183]:
def train(model, train_loader, test_loader, device,optimizer, criterion, epochs):
    total_losses=[]
    for epoch in tqdm(range(epochs),desc='Epoch'):
        epoch_loss=[]
        model.train()
        for idx, (input_ids, labels, polarities) in enumerate(tqdm(train_loader,desc='Training',leave=False)):
            optimizer.zero_grad()
            input_ids= input_ids.to(device)
            labels= labels.to(device)
            polarities=polarities.to(device)
            preds = model(input_ids)
            loss= criterion(preds, labels)
            loss.backward()
            optimizer.step()
            
            epoch_loss.append(loss)
        avg_loss = sum(epoch_loss)/len(epoch_loss)
        total_losses.append(avg_loss)
        print(f'Epoch {epoch+1}\t Training Loss: {avg_loss:.4f}')
    
        evaluate(model,test_loader,device)
    train_loss = sum(total_losses)/len(total_losses)
    
    print(f'Total Loss: {train_loss:.4f}')
    

In [184]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model =ABSAModel(tokenizer.get_vocab_size(), 512, 256,4,4,3)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(),lr = 1e-5)
criterion= nn.CrossEntropyLoss()



In [185]:
test_path = 'Dataset/laptops-trial.csv'
test_df = pd.read_csv(test_path)

In [186]:
test_df.Sentence= test_df.Sentence.apply(lambda x: text_normalize(x).lower())
test_df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2128,i liked the aluminum body,aluminum body,positive,12,25
1,81,lightweight and the screen is beautiful,screen,positive,20,26
2,353,from the build quality to the performance ever...,build quality,negative,9,22
3,353,from the build quality to the performance ever...,performance,negative,30,41
4,655,it was truly a great computer costing less tha...,costing,positive,30,37
5,2443,boots up fast and runs great,Boots up,positive,0,8
6,2443,boots up fast and runs great,runs,positive,18,22
7,764,call tech support standard email the form and ...,tech support,neutral,5,17
8,1479,the service i received from toshiba went above...,service,positive,4,11
9,2937,i would recommend it just because of the inter...,internet speed,positive,41,55


In [187]:
test_data={}
test_data['sentence']= test_df.Sentence
test_data['aspect_term']= test_df['Aspect Term']
test_data['polarity']= test_df['polarity']

In [188]:
test_ds = ABSADataset(test_data,tokenizer,seq_len)

In [189]:
test_loader = DataLoader(test_ds,batch_size=64, shuffle=False)

In [190]:
train(model,train_loader,test_loader,device,optimizer,criterion,20)

Epoch:   5%|▌         | 1/20 [00:08<02:48,  8.88s/it]

Epoch 1	 Training Loss: 0.6844
Score: 0.3168 


Epoch:  10%|█         | 2/20 [00:18<02:42,  9.02s/it]

Epoch 2	 Training Loss: 0.3390
Score: 0.3168 


Epoch:  15%|█▌        | 3/20 [00:26<02:29,  8.77s/it]

Epoch 3	 Training Loss: 0.3106
Score: 0.3168 


Epoch:  20%|██        | 4/20 [00:34<02:17,  8.62s/it]

Epoch 4	 Training Loss: 0.3069
Score: 0.3168 


Epoch:  25%|██▌       | 5/20 [00:43<02:09,  8.61s/it]

Epoch 5	 Training Loss: 0.3053
Score: 0.3168 


Epoch:  30%|███       | 6/20 [00:52<02:02,  8.74s/it]

Epoch 6	 Training Loss: 0.3036
Score: 0.3168 


Epoch:  35%|███▌      | 7/20 [01:01<01:56,  8.98s/it]

Epoch 7	 Training Loss: 0.3023
Score: 0.3168 


Epoch:  40%|████      | 8/20 [01:10<01:46,  8.85s/it]

Epoch 8	 Training Loss: 0.2979
Score: 0.3168 


Epoch:  45%|████▌     | 9/20 [01:19<01:36,  8.81s/it]

Epoch 9	 Training Loss: 0.2943
Score: 0.3168 


Epoch:  50%|█████     | 10/20 [01:27<01:26,  8.68s/it]

Epoch 10	 Training Loss: 0.2875
Score: 0.3168 


Epoch:  55%|█████▌    | 11/20 [01:36<01:18,  8.69s/it]

Epoch 11	 Training Loss: 0.2765
Score: 0.3168 


Epoch:  60%|██████    | 12/20 [01:44<01:08,  8.61s/it]

Epoch 12	 Training Loss: 0.2620
Score: 0.3168 


Epoch:  65%|██████▌   | 13/20 [01:53<01:00,  8.65s/it]

Epoch 13	 Training Loss: 0.2414
Score: 0.3463 


Epoch:  70%|███████   | 14/20 [02:02<00:52,  8.76s/it]

Epoch 14	 Training Loss: 0.2240
Score: 0.3844 


Epoch:  75%|███████▌  | 15/20 [02:10<00:43,  8.65s/it]

Epoch 15	 Training Loss: 0.2159
Score: 0.4708 


Epoch:  80%|████████  | 16/20 [02:19<00:34,  8.59s/it]

Epoch 16	 Training Loss: 0.2084
Score: 0.5194 


Epoch:  85%|████████▌ | 17/20 [02:28<00:26,  8.72s/it]

Epoch 17	 Training Loss: 0.2008
Score: 0.4744 


Epoch:  90%|█████████ | 18/20 [02:36<00:17,  8.65s/it]

Epoch 18	 Training Loss: 0.1968
Score: 0.5528 


Epoch:  95%|█████████▌| 19/20 [02:45<00:08,  8.55s/it]

Epoch 19	 Training Loss: 0.1935
Score: 0.5717 


Epoch: 100%|██████████| 20/20 [02:53<00:00,  8.69s/it]


Epoch 20	 Training Loss: 0.1872
Score: 0.6133 
Total Loss: 0.2819


# Aspect based sentiment analysis

In [None]:
df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,i charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,i charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,the tech guy then said the service center does...,service center,negative,27,41
3,1316,the tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,the tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2353,2272,we also use paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134
2354,2272,we also use paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170
2355,848,how toshiba handles the repair seems to vary s...,repair,conflict,24,30
2356,848,how toshiba handles the repair seems to vary s...,repair,positive,130,136


In [None]:
a= data['sentence'][0]
b= data['aspect_term'][0]
a+' <sep> '+b

'i charge it at night and skip taking the cord with me because of the good battery life <sep> cord'

In [None]:
max_len_tags = max(len(text.split(' ')) for text in data['aspect_term'].values)
max_len_tags

6

In [None]:
seq_len+=max_len_tags


In [None]:
def vectorize(data, tokenizer, seq_len):
    input_ids=[]
    polarities=[]
    for sentence, aspect_term , polarity in zip(data['sentence'],data['aspect_term'] ,data['polarity']):
        sentence= sentence + ' <sep> ' + aspect_term
        
        input_id = [tokenizer.token_to_id(word) if tokenizer.token_to_id(word) else 0  for word in sentence.split(' ') ]
        input_id= pad_and_truncate(input_id,seq_len, tokenizer.token_to_id('<pad>'))
        
        if polarity=='neural':
            key_polarity = 0
        elif polarity=='positive':
            key_polarity =1
        else:
            key_polarity =2
        
        
        input_ids.append(input_id)
        polarities.append(key_polarity)
    
    return {
        'input_ids': torch.tensor(input_ids),
        'polarities': torch.tensor(polarities)    
            }
        
        
        
        
        
        
        

In [None]:
a = vectorize(data,tokenizer,seq_len)

In [87]:
class ABSADataset(Dataset):
    def __init__(self, data, tokenizer, seq_len):
        temp= vectorize(data, tokenizer, seq_len)
        self.input_ids= temp['input_ids']
        self.polarities= temp['polarities']
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.polarities[idx]
    
    
    

In [None]:
train_ds = ABSADataset(data, tokenizer, seq_len)
train_loader = DataLoader(train_ds,batch_size=32,shuffle=True)


In [None]:
class ABSAModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size,n_heads, num_layers, num_classes):
        super(ABSAModel,self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Embedding(seq_len, embedding_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, dim_feedforward=1024, dropout=0.2, nhead=n_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.conv1d = nn.Conv1d(embedding_dim, 256, 3, 1,1)
        
        self.fc = nn.Sequential(
            nn.Linear(256,128),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(128,64),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(64,num_classes)
        )
        
    def forward(self, input_ids):
        positions= torch.arange(input_ids.size(1),device=input_ids.device).unsqueeze(0)
        embedded = self.embedding(input_ids)
        positional_encoding = self.positional_encoding(positions)
        inputs = embedded+ positional_encoding # NxLxD
        
        inputs = self.encoder(inputs)
        
        inputs = self.conv1d(inputs.permute(0,2,1)) # NxDxL
        avg_pooling = nn.AvgPool1d(kernel_size=inputs.size(2))
        inputs= avg_pooling(inputs) #NxDx1
        inputs = inputs.squeeze(2) #NxD
        logits = self.fc(inputs) 
        
        return logits 

In [None]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model =ABSAModel(tokenizer.get_vocab_size(), 512, 256,4,4,3)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(),lr = 1e-5)
criterion= nn.CrossEntropyLoss()



In [None]:
def train(model, train_loader, device,optimizer, criterion, epochs):
    total_losses=[]
    for epoch in tqdm(range(epochs),desc='Epoch'):
        epoch_loss=[]
        model.train()
        for idx, (input_ids, polarities) in enumerate(tqdm(train_loader,desc='Training',leave=False)):
            optimizer.zero_grad()
            input_ids= input_ids.to(device)
            polarities=polarities.to(device)
            preds = model(input_ids)
            loss= criterion(preds, polarities)
            loss.backward()
            optimizer.step()
            
            epoch_loss.append(loss)
        avg_loss = sum(epoch_loss)/len(epoch_loss)
        total_losses.append(avg_loss)
        print(f'Epoch {epoch+1}\t Training Loss: {avg_loss:.4f}')
    train_loss = sum(total_losses)/len(total_losses)
    
    print(f'Total Loss: {train_loss:.4f}')
    

In [None]:
train(model, train_loader, device,optimizer, criterion,50)

Epoch:   2%|▏         | 1/50 [00:09<08:04,  9.90s/it]

Epoch 1	 Training Loss: 0.7006


Epoch:   4%|▍         | 2/50 [00:19<07:32,  9.43s/it]

Epoch 2	 Training Loss: 0.7035


Epoch:   6%|▌         | 3/50 [00:28<07:15,  9.26s/it]

Epoch 3	 Training Loss: 0.6989


Epoch:   8%|▊         | 4/50 [00:38<07:20,  9.57s/it]

Epoch 4	 Training Loss: 0.6949


Epoch:  10%|█         | 5/50 [00:47<07:12,  9.61s/it]

Epoch 5	 Training Loss: 0.6984


Epoch:  12%|█▏        | 6/50 [00:57<07:06,  9.68s/it]

Epoch 6	 Training Loss: 0.6899


Epoch:  14%|█▍        | 7/50 [01:07<06:56,  9.69s/it]

Epoch 7	 Training Loss: 0.6889


Epoch:  16%|█▌        | 8/50 [01:16<06:41,  9.56s/it]

Epoch 8	 Training Loss: 0.6833


Epoch:  18%|█▊        | 9/50 [01:25<06:28,  9.48s/it]

Epoch 9	 Training Loss: 0.6832


Epoch:  20%|██        | 10/50 [01:35<06:17,  9.44s/it]

Epoch 10	 Training Loss: 0.6777


Epoch:  22%|██▏       | 11/50 [01:44<06:09,  9.46s/it]

Epoch 11	 Training Loss: 0.6713


Epoch:  24%|██▍       | 12/50 [01:54<05:57,  9.41s/it]

Epoch 12	 Training Loss: 0.6562


Epoch:  26%|██▌       | 13/50 [02:03<05:43,  9.29s/it]

Epoch 13	 Training Loss: 0.6497


Epoch:  28%|██▊       | 14/50 [02:12<05:31,  9.22s/it]

Epoch 14	 Training Loss: 0.6451


Epoch:  30%|███       | 15/50 [02:21<05:24,  9.26s/it]

Epoch 15	 Training Loss: 0.6310


Epoch:  32%|███▏      | 16/50 [02:30<05:16,  9.32s/it]

Epoch 16	 Training Loss: 0.6156


Epoch:  34%|███▍      | 17/50 [02:41<05:17,  9.63s/it]

Epoch 17	 Training Loss: 0.6067


Epoch:  36%|███▌      | 18/50 [02:50<05:04,  9.52s/it]

Epoch 18	 Training Loss: 0.5902


Epoch:  38%|███▊      | 19/50 [02:59<04:51,  9.40s/it]

Epoch 19	 Training Loss: 0.5622


Epoch:  40%|████      | 20/50 [03:08<04:39,  9.31s/it]

Epoch 20	 Training Loss: 0.5398


Epoch:  42%|████▏     | 21/50 [03:17<04:27,  9.23s/it]

Epoch 21	 Training Loss: 0.5223


Epoch:  44%|████▍     | 22/50 [03:26<04:17,  9.21s/it]

Epoch 22	 Training Loss: 0.4955


Epoch:  46%|████▌     | 23/50 [03:36<04:09,  9.23s/it]

Epoch 23	 Training Loss: 0.4981


Epoch:  48%|████▊     | 24/50 [03:45<04:01,  9.30s/it]

Epoch 24	 Training Loss: 0.4911


Epoch:  50%|█████     | 25/50 [03:54<03:52,  9.29s/it]

Epoch 25	 Training Loss: 0.4825


Epoch:  52%|█████▏    | 26/50 [04:04<03:43,  9.29s/it]

Epoch 26	 Training Loss: 0.4886


Epoch:  54%|█████▍    | 27/50 [04:13<03:32,  9.24s/it]

Epoch 27	 Training Loss: 0.4648


Epoch:  56%|█████▌    | 28/50 [04:23<03:25,  9.36s/it]

Epoch 28	 Training Loss: 0.4613


Epoch:  58%|█████▊    | 29/50 [04:32<03:15,  9.30s/it]

Epoch 29	 Training Loss: 0.4519


Epoch:  60%|██████    | 30/50 [04:41<03:07,  9.37s/it]

Epoch 30	 Training Loss: 0.4496


Epoch:  62%|██████▏   | 31/50 [04:50<02:57,  9.32s/it]

Epoch 31	 Training Loss: 0.4432


Epoch:  64%|██████▍   | 32/50 [05:00<02:47,  9.29s/it]

Epoch 32	 Training Loss: 0.4433


Epoch:  66%|██████▌   | 33/50 [05:10<02:41,  9.49s/it]

Epoch 33	 Training Loss: 0.4471


Epoch:  68%|██████▊   | 34/50 [05:19<02:30,  9.44s/it]

Epoch 34	 Training Loss: 0.4240


Epoch:  70%|███████   | 35/50 [05:28<02:20,  9.33s/it]

Epoch 35	 Training Loss: 0.4265


Epoch:  72%|███████▏  | 36/50 [05:37<02:09,  9.27s/it]

Epoch 36	 Training Loss: 0.4296


Epoch:  74%|███████▍  | 37/50 [05:48<02:06,  9.74s/it]

Epoch 37	 Training Loss: 0.4228


Epoch:  76%|███████▌  | 38/50 [05:58<01:56,  9.71s/it]

Epoch 38	 Training Loss: 0.4143


Epoch:  78%|███████▊  | 39/50 [06:10<01:55, 10.48s/it]

Epoch 39	 Training Loss: 0.4135


Epoch:  80%|████████  | 40/50 [06:19<01:40, 10.08s/it]

Epoch 40	 Training Loss: 0.4017


Epoch:  82%|████████▏ | 41/50 [06:29<01:29, 10.00s/it]

Epoch 41	 Training Loss: 0.4070


Epoch:  84%|████████▍ | 42/50 [06:39<01:19, 10.00s/it]

Epoch 42	 Training Loss: 0.3971


Epoch:  86%|████████▌ | 43/50 [06:48<01:08,  9.78s/it]

Epoch 43	 Training Loss: 0.3877


Epoch:  88%|████████▊ | 44/50 [06:57<00:57,  9.63s/it]

Epoch 44	 Training Loss: 0.3893


Epoch:  90%|█████████ | 45/50 [07:07<00:48,  9.62s/it]

Epoch 45	 Training Loss: 0.3876


Epoch:  92%|█████████▏| 46/50 [07:17<00:38,  9.61s/it]

Epoch 46	 Training Loss: 0.3896


Epoch:  94%|█████████▍| 47/50 [07:26<00:28,  9.62s/it]

Epoch 47	 Training Loss: 0.3719


Epoch:  96%|█████████▌| 48/50 [07:35<00:19,  9.51s/it]

Epoch 48	 Training Loss: 0.3710


Epoch:  98%|█████████▊| 49/50 [07:45<00:09,  9.44s/it]

Epoch 49	 Training Loss: 0.3797


Epoch: 100%|██████████| 50/50 [07:55<00:00,  9.50s/it]


Epoch 50	 Training Loss: 0.3760
Total Loss: 0.5203


In [192]:
test_df_path = 'Dataset/Laptops_Test_Data_PhaseA.csv'
test_df= pd.read_csv(test_df_path)


In [193]:
test_df

Unnamed: 0,id,Sentence
0,892:1,"Boot time is super fast, around anywhere from ..."
1,1144:1,tech support would not fix the problem unless ...
2,805:2,but in resume this computer rocks!
3,359:1,Set up was easy.
4,562:1,Did not enjoy the new Windows 8 and touchscree...
...,...,...
795,256:1,This hardware seems to be better than the iMac...
796,246:1,I'm done with WinDoze computers.
797,520:1,I've had it for about 2 months now and found n...
798,306:2,the latest version does not have a disc drive.


In [194]:
sample= text_normalize(test_df['Sentence'][0])
sample

'Boot time is super fast around anywhere from 35 seconds to 1 minute'

In [195]:
seq_len1=77
seq_len2= 83

In [196]:
a= [tokenizer.token_to_id(token) if tokenizer.token_to_id(token) else 0 for token in sample.split(' ')]
sample_test = pad_and_truncate(a,seq_len1, tokenizer.token_to_id('<pad>'))

In [197]:
sample_test= torch.tensor(sample_test)

In [200]:
sample_test= sample_test.unsqueeze(0)

In [205]:
sample_test=sample_test.to(device)

In [206]:
pred = model(sample_test)
pred

tensor([[[ 4.1994,  2.2421,  3.5294,  2.7246,  3.0935,  3.3896,  2.9228,
           3.5497,  2.6947,  3.2060,  4.1110,  2.6970,  1.4901,  3.1806,
           4.2165,  4.3635,  4.3837,  4.1999,  4.0665,  3.9642,  4.1904,
           4.1758,  4.4522,  4.2857,  4.2000,  4.0089,  3.6598,  3.4549,
           3.9715,  4.1006,  3.7732,  4.0891,  4.2622,  4.6238,  4.2373,
           4.0179,  4.1496,  4.2162,  3.8610,  3.5792,  4.1347,  4.6534,
           4.0881,  3.5803,  4.2580,  3.8267,  3.9619,  4.0549,  4.1644,
           4.4701,  3.7900,  4.1543,  4.4493,  4.3665,  3.5797,  4.2836,
           3.8928,  3.4985,  3.8494,  4.0751,  4.1661,  3.8077,  4.0885,
           4.1319,  3.9510,  3.8705,  4.5993,  3.8970,  3.8329,  3.9439,
           4.2804,  4.0325,  3.9693,  4.0627,  3.9716,  4.5557,  3.3292],
         [-1.5875,  0.0946, -1.7099, -0.4711, -1.3097, -0.9472, -0.9353,
          -1.6119, -0.5708, -1.4030, -1.4700, -0.2178, -0.6922, -2.0489,
          -1.3337, -1.3308, -1.4849, -1.3740, -1.2

In [208]:
result=torch.argmax(pred,dim=1)
result

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]], device='cuda:0')

# BERT

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
b=tokenizer('im going to school','going')
c= tokenizer.convert_ids_to_tokens(b['input_ids'])
c

['[CLS]', 'im', 'going', 'to', 'school', '[SEP]', 'going', '[SEP]']

In [None]:
sample= data['sentence'][1]
c=sample.split(' ')
c

['i',
 'charge',
 'it',
 'at',
 'night',
 'and',
 'skip',
 'taking',
 'the',
 'cord',
 'with',
 'me',
 'because',
 'of',
 'the',
 'good',
 'battery',
 'life']

In [1]:
tokenizer(sample)

NameError: name 'tokenizer' is not defined

In [15]:
seq_len = max([len(tokenizer(text)['input_ids']) for text in data['sentence']])
seq_len


NameError: name 'data' is not defined

In [57]:
for text in data['sentence']:
    if len(tokenizer.tokenize(text))>80:
        print(tokenizer.tokenize(text))

['in', 'november', 'my', 'computer', 'messed', 'up', 'entirely', 'and', 'wouldn', '##t', 'power', 'on', 'after', 'int', '##all', '##ing', 'a', 'windows', 'update', 'i', 'had', 'to', 'have', 'my', 'hd', 'flashed', 'and', 'lost', 'everything', 'on', 'it', 'including', 'my', 'school', 'assignments', 'and', 'ir', '##rip', '##lace', '##able', 'pictures', 'that', 'were', 'only', 'in', 'digital', 'format', 'and', 'several', 'other', 'things', 'when', 'this', 'update', 'was', 'installed', 'for', 'some', 'reason', 'i', 'was', 'unable', 'to', 'roll', 'back', 'the', 'drivers', 'and', 'everything', 'to', 'an', 'earlier', 'working', 'condition', 'because', 'when', 'the', 'update', 'was', 'installed', 'it', 'deleted', 'my', 'history']
['in', 'november', 'my', 'computer', 'messed', 'up', 'entirely', 'and', 'wouldn', '##t', 'power', 'on', 'after', 'int', '##all', '##ing', 'a', 'windows', 'update', 'i', 'had', 'to', 'have', 'my', 'hd', 'flashed', 'and', 'lost', 'everything', 'on', 'it', 'including', 'm

In [30]:
tokenizer.tokenize('taking another')

['taking', 'another']

In [26]:
a=tokenizer.tokenize(c[7])
print(a)
b= tokenizer.convert_tokens_to_ids(a)
b

['taking']


[2635]

In [19]:
tokenizer.convert_tokens_to_ids('<pad>')

100

In [None]:

def term_bert_tokenize(data,tokenizer,seq_len):
    input_ids=[]
    labels=[]
    for sentence, aspect_term in zip(data['sentence'],data['aspect_term']):
        bert_tokens = []
        bert_tags = []
        key_aspect=tokenizer.tokenize(aspect_term)
        print(key_aspect)
        
        for word in sentence.split(' '):
            t = tokenizer.tokenize(word)
            bert_tokens += t
            
            if word in key_aspect:
                if word == key_aspect[0]:
                    bert_tags+= [1]*len(t)
                else:
                    bert_tags+= [2]*len(t)
            else:
                bert_tags+=[0]*len(t)
        bert_tokens = tokenizer.convert_tokens_to_ids(bert_tokens)
        bert_tokens = pad_and_truncate(bert_tokens,seq_len,tokenizer.convert_tokens_to_ids('<pad>'))
        bert_tags = pad_and_truncate(bert_tags,seq_len,-100)
        input_ids.append(bert_tokens)
        labels.append(bert_tags)
    return {
        'input_ids': torch.tensor(input_ids),
        'labels': torch.tensor(labels),  
            }
    

In [105]:
sample = {}
sample['sentence']= data['sentence'][0:4]
sample['aspect_term']= data['aspect_term'][0:4]

In [120]:
sample['sentence'][0]

'i charge it at night and skip taking the cord with me because of the good battery life '

In [108]:
a = term_bert_tokenize(data,tokenizer,100)

['cord']
['battery', 'life']
['service', 'center']
['sales', 'team']
['tech', 'guy']
['quality']
['gui']
['applications']
['use']
['start', 'up']
['features']
['ich', '##at']
['photo', '##boot', '##h']
['garage', 'band']
['features']
['gui']
['screen']
['power', 'light']
['hard', 'drive', 'light']
['battery']
['rubber', 'enclosure']
['edge']
['multi', 'touch', 'gestures']
['tracking', 'area']
['external', 'mouse']
['gaming']
['suite', 'of', 'software']
['speed']
['windows', '7']
['usb', 'devices']
['keyboard']
['software']
['system']
['microsoft', 'office', 'for', 'the', 'mac']
['sync', '##ing']
['30', 'hd', 'monitor']
['screen']
['boot', 'up']
['service']
['battery']
['operating', 'system']
['pre', '##loaded', 'software']
['price']
['features']
['clock', 'in', 'bio', '##s', 'setup']
['warrant', '##y', 'service']
['brand']
['warrant', '##y']
['features']
['fan']
['customer', 'service', 'number']
['warrant', '##y']
['talking', 'to', 'a', 'technician']
['hard', 'disc']
['windows']
['driv

In [123]:
tokenizer.tokenize('i charge it at night and skip taking the cord with me because of the good battery life ')

['i',
 'charge',
 'it',
 'at',
 'night',
 'and',
 'skip',
 'taking',
 'the',
 'cord',
 'with',
 'me',
 'because',
 'of',
 'the',
 'good',
 'battery',
 'life']

In [119]:
print(a['input_ids'][20])
print(a['labels'][20])

tensor([ 1999,  1996,  4497,  2122,  6097,  1001,  1001,  2808,  2024,  4372,
         1001,  1001,  2553,  2094,  1999,  1037,  3730,  8903, 17539,  2061,
         2017,  2097,  2196,  2113,  2055,  1996, 15082,  3341,  2127,  2017,
         4965,  2009,  2131,  2009,  2188,  3338,  1996,  7744,  1998,  2224,
         2009,  2200, 12266,  9530,   100,   100,   100,   100,   100,   100,
          100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
          100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
          100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
          100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
          100,   100,   100,   100,   100,   100,   100,   100,   100,   100])
tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    1,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    

In [69]:
data['aspect_term'][3]

'"sales" team'

In [79]:
a =tokenizer.tokenize('I need "sales" team' )

In [80]:
tokenizer.convert_tokens_to_ids(a)

[1045, 2342, 1000, 4341, 1000, 2136]

In [75]:
b= tokenizer.convert_ids_to_tokens([1996, 6627, 3124, 2059, 2056, 1996, 2326, 2415, 2515, 2025, 2079, 1015,
        3406, 2487, 3863, 1998, 1045, 2031, 2000, 3622, 2026, 5142, 2000, 1996,
        4341, 2136, 2029, 2003, 1996, 7027, 4497, 2029, 1045, 4149, 2026, 5658,
        8654, 2013])
print(len(b))
' '.join(b)

38


'the tech guy then said the service center does not do 1 ##to ##1 exchange and i have to direct my concern to the sales team which is the retail shop which i bought my net ##book from'

In [None]:
a

In [18]:
data['sentence'][0]

'i charge it at night and skip taking the cord with me because of the good battery life'

In [16]:
a['labels'][0]

tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    0,    0,
           0,    0,    0,    0,    0,    0, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100])

In [58]:
id2label={
    0: '0',
    1: 'B-term',
    2: 'I-term'
}
label2id= {
    '0': 0,
    'B-term': 1,
    'I-term': 2
}


In [4]:
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
bert = AutoModelForTokenClassification.from_pretrained('distilbert/distilbert-base-uncased') 

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
a=list(model.children())
a

[DistilBertModel(
   (embeddings): Embeddings(
     (word_embeddings): Embedding(30522, 768, padding_idx=0)
     (position_embeddings): Embedding(512, 768)
     (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (transformer): Transformer(
     (layer): ModuleList(
       (0-5): 6 x TransformerBlock(
         (attention): DistilBertSdpaAttention(
           (dropout): Dropout(p=0.1, inplace=False)
           (q_lin): Linear(in_features=768, out_features=768, bias=True)
           (k_lin): Linear(in_features=768, out_features=768, bias=True)
           (v_lin): Linear(in_features=768, out_features=768, bias=True)
           (out_lin): Linear(in_features=768, out_features=768, bias=True)
         )
         (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
         (ffn): FFN(
           (dropout): Dropout(p=0.1, inplace=False)
           (lin1): Linear(in_features=768, out_features=3072, bias=True

In [79]:
backbone = nn.Sequential(*list(model.children()))[:-2]
backbone

Sequential(
  (0): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [None]:
class Term_BertModel(nn.Module):
    def __init__(self, bert):
        super(ABSAModel,self).__init__()
        self.backbone = nn.Sequential(*list(bert.children()))[:-2]
        self.fc = nn.Sequential(
            nn.Linear(768,512),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(512,256),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(256,3)
        )

    def forward(self, input):
        input = self.backbone(input).last_hidden_state
        logits = self.fc(input)
        
        return logits.permute(0,2,1)
        

In [None]:
class ABSADataset(Dataset):
    def __init__(self, data, tokenize, tokenizer, seq_len):
        temp= tokenize(data, tokenizer, seq_len)
        self.input_ids= temp['input_ids']
        self.labels = temp['labels']
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]
    
    
    

In [98]:
train_ds = ABSADataset(data,tokenizer,seq_len)

In [99]:
train_loader = DataLoader(train_ds, batch_size=32,shuffle=True)

In [105]:
def train(model, train_loader, device,optimizer, criterion, epochs):
    total_losses=[]
    for epoch in tqdm(range(epochs),desc='Epoch'):
        epoch_loss=[]
        model.train()
        for idx, (input_ids, labels) in enumerate(tqdm(train_loader,desc='Training',leave=False)):
            optimizer.zero_grad()
            input_ids= input_ids.to(device)
            labels= labels.to(device)
            preds = model(input_ids)
            loss= criterion(preds, labels)
            loss.backward()
            optimizer.step()
            
            epoch_loss.append(loss)
        avg_loss = sum(epoch_loss)/len(epoch_loss)
        total_losses.append(avg_loss)
        print(f'Epoch {epoch+1}\t Training Loss: {avg_loss:.4f}')
    train_loss = sum(total_losses)/len(total_losses)
    
    print(f'Total Loss: {train_loss:.4f}')
    

In [139]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'

model =ABSAModel(bert)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(),lr = 1e-5)
criterion= nn.CrossEntropyLoss()

In [None]:
train(model, train_loader,device,optimizer,criterion,10)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

In [45]:
tokenizer('im going to school','going',truncation=True, padding=True, return_tensors='pt')

{'input_ids': tensor([[  101, 10047,  2183,  2000,  2082,   102,  2183,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [51]:
seq_len = max([len(tokenizer(sentence, aspect_term)['input_ids']) for sentence,aspect_term in zip(data['sentence'],data['aspect_term'])])

In [52]:
seq_len

89

100

In [58]:
dt = sentiment_bert_tokenize(data,tokenizer,seq_len)

In [1]:
import torch
import torch.nn as nn

In [9]:
class Sentiment_BertModel(nn.Module):
    def __init__(self, bert):
        super(Sentiment_BertModel,self).__init__()
        self.backbone = nn.Sequential(*list(bert.children()))[:-2]
        self.fc = nn.Sequential(
            nn.Linear(768,512),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(512,256),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(256,3)
        )

    def forward(self, input):
        input = self.backbone(input).last_hidden_state[:,0,:].squeeze(1)
        logits = self.fc(input)
        return logits.permute(0,2,1) #NxCxL
        

In [10]:
model =Sentiment_BertModel(bert)

In [11]:
model.load_state_dict(torch.load('checkpoints/sentiment_model.pth'))

<All keys matched successfully>

In [4]:
sentence= torch.tensor([0,2,3,4])
a= sentence.squeeze(0)
a

tensor([0, 2, 3, 4])

In [1]:
import torch

print(torch.cuda.is_available())

False
