In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.models as models
from transformers import BertTokenizer, BertModel

import pandas as pd 
import numpy as np
import re
import os
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')

import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [2]:
pretrained_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_basic_tokenize=True)
pretrained_bert = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)
pretrained_bert.eval()
print("Bert Loaded !")

Bert Loaded !


In [3]:
def get_word_embeddings(current_index,tokens):
    
    marked_text = None
    
    if(current_index != 0 and current_index != 1 and current_index != len(tokens)-2 and current_index !=len(tokens)-1):
        marked_text = tokens[current_index-2:current_index+3]
    
    else:
        if(current_index == 0):
            marked_text = tokens[current_index:current_index+3] 
            marked_text = ["CLS"] + ["CLS"] + marked_text

        elif(current_index == 1):
            marked_text = tokens[current_index-1:current_index+3] 
            marked_text = ["CLS"] + marked_text

        if(current_index == len(tokens)-2):
            marked_text = tokens[current_index-2:current_index+2] 
            marked_text = marked_text + ["SEP"]
            
        elif(current_index == len(tokens)-1):
            marked_text = tokens[current_index-2:current_index+2] 
            marked_text = marked_text + ["SEP"] + ["SEP"]

    marked_text = ' '.join(marked_text)
        
    # Split the sentence into tokens.
    tokenized_text = pretrained_tokenizer.basic_tokenizer.tokenize(marked_text)
    
    # Map the token strings to their vocabulary indices.
    indexed_tokens = pretrained_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    
    with torch.no_grad():
        outputs = pretrained_bert(tokens_tensor)
        
        # can use last hidden state as word embeddings
        last_hidden_state = outputs[0]
        word_embed_1 = last_hidden_state
        
    return word_embed_1

In [4]:
def get_correct_word_labels(sentence_tokens,word_sentiments):
    
    word_sentiments = word_sentiments.replace('[','')
    word_sentiments = word_sentiments.replace(']','')
    word_sentiments = word_sentiments.replace("'",'')

    word_sentiments = word_sentiments.split(', ')

    final_labels = [float(x) for x in word_sentiments]
    final_labels
    
    sentence_tokens = sentence_tokens.replace('[','')
    sentence_tokens = sentence_tokens.replace(']','')
    sentence_tokens = sentence_tokens.replace("'",'')
    
    sentence_tokens = sentence_tokens.split(', ')
    
    return sentence_tokens, final_labels

In [5]:
class Custom_Dataset(Dataset):

    def __init__(self, csv_file, train = True, val = False):
        
        self.df = pd.read_csv(csv_file, index_col = 0)
        
        if(val):
            self.df = self.df[self.df['split']=='val']
            
        elif(train):
            self.df = self.df[self.df['split']=='train']
       
        else:
            self.df = self.df[self.df['split']=='test']
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        
        text_tokens = self.df.iloc[index]['tokens']
        sentiments = self.df.iloc[index]['word_sentiment']
        
        return text_tokens, sentiments

In [2]:
class NeuralNetwork(nn.Module):

    def __init__(self, text_input = 3840, output_num = 1,device="cpu"):
        super().__init__()
        self.device = device
            
        self.layer_1 = nn.Linear(in_features = text_input, out_features = 2000, device=device)
        self.layer_2 = nn.Linear(in_features = 2000, out_features = 1450, device=device)
        self.layer_3 = nn.Linear(in_features = 1450, out_features = 975, device=device)
        self.layer_4 = nn.Linear(in_features = 975, out_features = 130, device=device)
        self.layer_5 = nn.Linear(in_features = 130, out_features = 10, device=device)
        self.layer_6 = nn.Linear(in_features = 10, out_features = output_num, device=device)
        
        self.act_fn_relu = nn.ReLU()
        self.act_fn_sigmoid = nn.Sigmoid()
        self.Dp = nn.Dropout()


    def forward(self, x):
        
        x = self.layer_1(x)
        x = self.act_fn_relu(x)
        
        x = self.layer_2(x)
        x = self.act_fn_relu(x)
        
        x = self.layer_3(x)
        x = self.act_fn_relu(x)
        
        x = self.Dp(x)
        
        x = self.layer_4(x)
        x = self.act_fn_relu(x)
        
        x = self.layer_5(x)
        x = self.act_fn_relu(x)
        
        x = self.layer_6(x)
        output = self.act_fn_sigmoid(x)
        
        return output

In [7]:
training_data = Custom_Dataset(csv_file='./clean.csv',train = True)
# testing_data = Custom_Dataset(csv_file='./clean.csv',train = False)

In [11]:
train_dataloader = DataLoader(training_data, batch_size=32, shuffle=True)
# test_dataloader = DataLoader(testing_data, batch_size=32, shuffle=True)

In [3]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

network = NeuralNetwork(3840,1,device)
# network.load_state_dict(torch.load('./word_level_model.pt'))
# network.eval()

In [4]:
print(network)

NeuralNetwork(
  (layer_1): Linear(in_features=3840, out_features=2000, bias=True)
  (layer_2): Linear(in_features=2000, out_features=1450, bias=True)
  (layer_3): Linear(in_features=1450, out_features=975, bias=True)
  (layer_4): Linear(in_features=975, out_features=130, bias=True)
  (layer_5): Linear(in_features=130, out_features=10, bias=True)
  (layer_6): Linear(in_features=10, out_features=1, bias=True)
  (act_fn_relu): ReLU()
  (act_fn_sigmoid): Sigmoid()
  (Dp): Dropout(p=0.5, inplace=False)
)


In [5]:
for name, param in network.named_parameters():
    print(f"Parameter {name}, shape {param.shape}")

Parameter layer_1.weight, shape torch.Size([2000, 3840])
Parameter layer_1.bias, shape torch.Size([2000])
Parameter layer_2.weight, shape torch.Size([1450, 2000])
Parameter layer_2.bias, shape torch.Size([1450])
Parameter layer_3.weight, shape torch.Size([975, 1450])
Parameter layer_3.bias, shape torch.Size([975])
Parameter layer_4.weight, shape torch.Size([130, 975])
Parameter layer_4.bias, shape torch.Size([130])
Parameter layer_5.weight, shape torch.Size([10, 130])
Parameter layer_5.bias, shape torch.Size([10])
Parameter layer_6.weight, shape torch.Size([1, 10])
Parameter layer_6.bias, shape torch.Size([1])


In [15]:
optimizer = torch.optim.SGD(network.parameters(),lr=0.001)
loss_fn = nn.BCELoss()

In [22]:
epoch_loss = 0
word_count = 0
for epoch in range(0,30):
    
    text_batch , target_batch = next(iter(train_dataloader))

    for text,target in zip(text_batch,target_batch):
        text,target = get_correct_word_labels(text,target)
        
        for index,label in zip(range(len(text)),target):

            embedding = get_word_embeddings(index,text)
            label = torch.tensor([label])

            embedding = embedding.reshape(-1)
            out = network(embedding)

            loss = loss_fn(out, label)
            
            epoch_loss += loss.item()
            word_count += 1
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    print(f'Finished epoch: {epoch}, latest loss: {epoch_loss}')
    print(f'Finished epoch: {epoch}, average loss: {epoch_loss/word_count}')
    epoch_loss = 0
    word_count = 0

Finished epoch: 0, latest loss: 51.437211483736974
Finished epoch: 0, average loss: 0.13827207388101337
Finished epoch: 1, latest loss: 49.90580752115932
Finished epoch: 1, average loss: 0.13098637144661238
Finished epoch: 2, latest loss: 66.71981935244844
Finished epoch: 2, average loss: 0.16848439230416273
Finished epoch: 3, latest loss: 37.255266746244615
Finished epoch: 3, average loss: 0.09601872872743457
Finished epoch: 4, latest loss: 38.653017963035154
Finished epoch: 4, average loss: 0.10532157483115846
Finished epoch: 5, latest loss: 41.16056511110841
Finished epoch: 5, average loss: 0.10315931105540956
Finished epoch: 6, latest loss: 57.610594119637845
Finished epoch: 6, average loss: 0.14734167294025025
Finished epoch: 7, latest loss: 55.18556358490639
Finished epoch: 7, average loss: 0.13761985931398102
Finished epoch: 8, latest loss: 56.10813182352189
Finished epoch: 8, average loss: 0.13422998043904757
Finished epoch: 9, latest loss: 44.93336822231447
Finished epoch: 9, 

In [13]:
predicted = []   
gt = []

for x in range(len(testing_data)):
    
    text, target = testing_data[x]
    text,target = get_correct_word_labels(text,target)
       
    for index,label in zip(range(len(text)),target):
            
        embedding = get_word_embeddings(index,text)
        label = torch.tensor([label])

        embedding = embedding.reshape(-1)
        out = network(embedding)
    
        if(out.item() >= 0.5):
            predicted.append(1.0)
        else:
            predicted.append(0.0)
        
        gt.append(label.item())
    
    if(x%1000 == 0):
        print(f'Sentence no {x+1}')

Sentence no 1
Sentence no 1001
Sentence no 2001
Sentence no 3001
Sentence no 4001
Sentence no 5001
Sentence no 6001
Sentence no 7001
Sentence no 8001
Sentence no 9001
Sentence no 10001
Sentence no 11001
Sentence no 12001
Sentence no 13001
Sentence no 14001
Sentence no 15001


In [16]:
print(f'ACCURACY: {accuracy_score(gt,predicted)}')
print(f'PRECISION: {precision_score(gt,predicted)}')
print(f'RECALL: {recall_score(gt,predicted)}')
print(f'F1 SCORE: {f1_score(gt,predicted)}')

ACCURACY: 0.955380779018623
PRECISION: 0.8841546665292769
RECALL: 0.890578375152443
F1 SCORE: 0.8873548954212881


In [23]:
# torch.save(network.state_dict(), "v5.pt")

In [13]:
predicted = []   
gt = []

for epoch in range(0,10):
    
    text_batch , target_batch = next(iter(train_dataloader))
    
    for text,target in zip(text_batch,target_batch):
        text,target = get_correct_word_labels(text,target)
        
        for index,label in zip(range(len(text)),target):

            embedding = get_word_embeddings(index,text)
            label = torch.tensor([label])

            embedding = embedding.reshape(-1)
            out = network(embedding)
            
            if(out.item() >= 0.5):
                predicted.append(1.0)
            else:
                predicted.append(0.0)

            gt.append(label.item())
    
    
    print(f'Epoch no {epoch+1}')

Epoch no 1
Epoch no 2
Epoch no 3
Epoch no 4
Epoch no 5
Epoch no 6
Epoch no 7
Epoch no 8
Epoch no 9
Epoch no 10


In [14]:
print(f'ACCURACY: {accuracy_score(gt,predicted)}')
print(f'PRECISION: {precision_score(gt,predicted)}')
print(f'RECALL: {recall_score(gt,predicted)}')
print(f'F1 SCORE: {f1_score(gt,predicted)}')

ACCURACY: 0.9557288225188317
PRECISION: 0.9011857707509882
RECALL: 0.8808757244043787
F1 SCORE: 0.8909150113969392
