In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
import numpy as np

import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from keras.preprocessing.sequence import pad_sequences

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/revanth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/revanth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/revanth/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def process_sentence(text):
    
    text = word_tokenize(text) # tokenize words in text
    text = [re.sub('[^A-Za-z]+', '', word) for word in text] # this line substitutes any white space before the word by removing the space
    text = [word.lower() for word in text if word.isalpha()] # lower each word in text
    text = [WordNetLemmatizer().lemmatize(word) for word in text] # lemmatization of words, so when see persons an person, both are dealt as one word person
    text = ' '.join(text) # join words into text again
    return text

In [3]:
def get_data(path):
    x_data = pd.read_json(path)
    x_data['sentence'] = x_data['sentence'].apply(process_sentence)
    y_data = pd.factorize(x_data['sentiment'])[0]
    x_data. drop('sentiment', axis=1, inplace=True)
    
    return x_data, y_data

In [4]:
x_train, y_train = get_data('./acsa_hard_train.json')
x_test, y_test = get_data('./acsa_hard_test.json')

In [5]:
MAX_VOCAB = 5000

tokenizer = Tokenizer(num_words=MAX_VOCAB)
aspect_tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(np.concatenate([x_train.sentence.to_numpy(), x_test.sentence.to_numpy()], axis=0))
aspect_tokenizer.fit_on_texts(np.concatenate([x_train.aspect.to_numpy(), x_test.aspect.to_numpy()], axis=0))

words_to_index = tokenizer.word_index
len(words_to_index)

992

In [6]:
emmbed_dict = {}
with open('glove.6B.50d.txt','r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:],'float32')
        emmbed_dict[word]=vector

maxSenLen = 150

In [7]:
INPUT_DIM = len(words_to_index) + 1
ASPECT_INPUT_DIM = len(aspect_tokenizer.word_index) + 1
EMBEDDING_DIM = emmbed_dict['apple'].shape[0]

emb_matrix = np.zeros((INPUT_DIM, EMBEDDING_DIM))
aspect_emb_matrix = np.zeros((ASPECT_INPUT_DIM, EMBEDDING_DIM))

for word, index in words_to_index.items():
    
    emb_vector = emmbed_dict.get(word)
    if emb_vector is not None:
        emb_matrix[index] = emb_vector
        
for word, index in aspect_tokenizer.word_index.items():
    
    emb_vector = emmbed_dict.get(word)
    if emb_vector is not None:
        aspect_emb_matrix[index] = emb_vector

In [8]:
def fa(x_data):
    feature = tokenizer.texts_to_sequences(x_data.sentence)
    feature = pad_sequences(feature, maxlen=maxSenLen, padding='post')
    feature = np.array([np.array(x) for x in feature])

    aspect = aspect_tokenizer.texts_to_sequences(x_data.aspect)
    aspect = pad_sequences(aspect, maxlen=1, padding='post')
    aspect = np.array([np.array(x) for x in aspect])
    
    return feature, aspect

feature_train, aspect_train = fa(x_train)
feature_test, aspect_test = fa(x_test)

In [9]:
class CNN(nn.Module):
    def __init__(self, vocab_size, aspect_vocab_size, embedding_dim, n_filters, filter_size, n_classes):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(emb_matrix, device="cpu"), requires_grad=True)
        
        self.aspect_embedding = nn.Embedding(aspect_vocab_size, embedding_dim)
        self.aspect_embedding.weight = nn.Parameter(torch.tensor(aspect_emb_matrix, device="cpu"), requires_grad=True)
                
        self.convs1 = nn.Conv1d(in_channels = embedding_dim, out_channels = n_filters, kernel_size = filter_size)
        self.convs2 = nn.Conv1d(in_channels = embedding_dim, out_channels = n_filters, kernel_size = filter_size)
        
        self.fc = nn.Linear(n_filters, n_classes)
        self.aspect_fc = nn.Linear(embedding_dim, n_filters)
                
    def forward(self, text, aspect):
        
        
        embedded = self.embedding(text)
        aspect_embedded = self.aspect_embedding(aspect).squeeze(1)
                        
        embedded = embedded.permute(0, 2, 1).float()
        aspect_embedded = aspect_embedded.float()
                
        x = F.tanh(self.convs1(embedded))
        y = F.relu(self.convs2(embedded) + self.aspect_fc(aspect_embedded).unsqueeze(2))
        x = x * y
            
        pooled = F.max_pool1d(x, x.shape[2]).squeeze(2)
            
        return self.fc(pooled)

In [10]:
N_FILTERS = 10
FILTER_SIZE = 3
N_CLASSES = 4

model = CNN(INPUT_DIM, ASPECT_INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZE, N_CLASSES)

In [11]:
device = torch.device('cpu')

In [12]:
optimizer = optim.Adam(model.parameters())

model = model.to(device)

In [13]:
def accuracy(preds, y):
    correct = (preds.argmax(1) == y).sum()
    acc = correct.sum() / len(y)
    return acc

In [14]:
def train(model, feature, aspect, optimizer):
    
    total_loss = 0
    total_acc = 0
    
    model.train()
    
    for i in range(len(feature)):
        
        optimizer.zero_grad()

        ftr = torch.tensor([feature[i]])
        ftr = ftr.to(device)
        asp = torch.tensor([aspect[i]])
        asp = asp.to(device)

        predictions = model(ftr, asp).squeeze(1)

        loss = F.cross_entropy(predictions, torch.tensor(y_train[i], device="cpu").view(-1))

        acc = accuracy(predictions, torch.tensor(y_train[i], device="cpu").view(-1))

        loss.backward()

        optimizer.step()

        total_loss += loss.item()
        total_acc += acc.item()
        
    return total_loss / len(feature), total_acc / len(feature)

In [15]:
N_ITERS = 8
for i in range(N_ITERS):
    train_loss, train_acc = train(model, feature_train, aspect_train, optimizer)

  ftr = torch.tensor([feature[i]])


In [16]:
print(train_loss, train_acc)

0.5894055972226627 0.7643835616438356


In [17]:
def eval(model, feature, aspect):
    
    total_loss = 0
    total_acc = 0
    
    model.eval()
    
    for i in range(len(feature)):
                
        ftr = torch.tensor([feature[i]])
        ftr = ftr.to(device)
        asp = torch.tensor([aspect[i]])
        asp = asp.to(device)
        
        predictions = model(ftr, asp).squeeze(1)
        
        loss = F.cross_entropy(predictions, torch.tensor(y_test[i], device="cpu").view(-1))
        
        acc = accuracy(predictions, torch.tensor(y_test[i], device="cpu").view(-1))
        
        total_loss += loss.item()
        total_acc += acc.item()
        
    return total_loss / len(feature), total_acc / len(feature)

In [18]:
test_loss, test_acc = eval(model, feature_test, aspect_test)

In [19]:
print(test_loss, test_acc)

2.0135337129402697 0.1797752808988764
