## Part 1

In [1]:
import numpy as np
from pprint import pprint

In [2]:
def read_train_file(filename):
    with open(filename, encoding='utf-8') as f:
        file_content = f.read()

    # Split the entire file into sentences. Output: List of sentences
    sentences = file_content.strip().split('\n\n')

    # Split each sentence into their token_tag pair
    # Output: List of sentences. Each sentence is a list of token_tag_pair
    token_tag_pairs = [i.split('\n') for i in sentences]

    # Separate each token_tag_pair into a list of [token, tag].
    # Output: [[[token, tag], [token, tag], ...], [[token, tag], [token, tag], ...], ...]
    for idx, sentence in enumerate(token_tag_pairs):
        token_tags = [i.rsplit(' ', maxsplit=1) for i in sentence]
        token_tag_pairs[idx] = token_tags

    return token_tag_pairs

In [3]:
train_dataset = './dataset/train'

In [4]:
train_dataset = read_train_file(train_dataset)

In [5]:
emission_count = {}
transition_count = {}
state_count = {}
possible_states = []

transition_count['START'] = {}
for sentence in train_dataset:
    prev_state = None
    
    for token, tag in sentence:
        if tag not in possible_states:
            possible_states.append(tag)

        if emission_count.get(token) == None:
            emission_count[token] = {}
        
        emission_count[token][tag] = emission_count[token].get(tag, 0) + 1

        if prev_state != None:
            if transition_count.get(prev_state) == None:
                transition_count[prev_state] = {}
            transition_count[prev_state][tag] = transition_count[prev_state].get(tag, 0) + 1

        else:
            transition_count['START'][tag] = transition_count['START'].get(tag, 0) + 1
            state_count['START'] = state_count.get('START', 0) + 1

        state_count[tag] = state_count.get(tag, 0) + 1
        prev_state = tag

    transition_count[prev_state]['STOP'] = transition_count[prev_state].get('STOP', 0) + 1

In [6]:
f = {}

for token, tags in emission_count.items():
    for tag, e_count in tags.items():
        key = "emission: " + tag + '+' + token
        e_prob = np.log(e_count/state_count[tag])
        f[key] = e_prob

for prev_tag, next_tags in transition_count.items():
    for next_tag, t_count in next_tags.items():
        key = "transition: " + prev_tag + '+' + next_tag
        t_prob = np.log(t_count/state_count[prev_tag])
        f[key] = t_prob

In [7]:
for key in f.keys():
    if key.startswith('transition:'):
        print(key, f[key])

transition: START+O -0.06165959394788234
transition: START+B-negative -4.539564428890097
transition: START+B-positive -3.153270067770207
transition: START+B-neutral -5.0503900526560885
transition: O+O -0.1469289780609056
transition: O+B-positive -3.100585479223317
transition: O+B-negative -4.235764913309846
transition: O+STOP -2.5966054955940074
transition: O+B-neutral -5.94191062981491
transition: B-positive+O -0.3572728512809094
transition: B-positive+I-positive -1.2540189870820944
transition: B-positive+STOP -4.252688120309395
transition: B-positive+B-positive -7.0859014643656115
transition: B-negative+O -0.2344506222289012
transition: B-negative+I-negative -1.6041608553332567
transition: B-negative+STOP -4.836281906951478
transition: I-positive+O -0.5419771288708248
transition: I-positive+I-positive -0.8965221465517323
transition: I-positive+STOP -4.564348191467836
transition: B-neutral+O -0.23293155768037255
transition: B-neutral+I-neutral -1.7788560643921472
transition: B-neutral

## Part 2

In [8]:
'''
calculate_score(x,y):
Helps to calulate the score for a given pair of input and output sequence pair (x,y)
Based on 2 features, emission and transition

Parameters:
x: List of tokens, e.g. x = x1, x2, ..., xn     Type: list[str]
y: List of tokens, e.g. y = y1, y2, ..., yn     Type: list[str]
f: Dictionary of feature weights                   Type: Dict{features: weights}
'''

def calculate_score(x,y,f):
    assert len(x) == len(y)

    feature_count = {}

    prev_tag = 'START'
    score = 0

    length = len(x)
    for i in range(length):
        e_key = "emission: " + y[i] + '+' + x[i]
        t_key = "transition: " + prev_tag + '+' + y[i]

        if e_key in f.keys():
            feature_count[e_key] = feature_count.get(e_key, 0) + 1

        if t_key in f.keys():
            feature_count[t_key] = feature_count.get(t_key, 0) + 1

        prev_tag = y[i]
        
    t_key = "transition: " + prev_tag + '+' + 'STOP'
    if t_key in f.keys():
            feature_count[t_key] = feature_count.get(t_key, 0) + 1

    for feature, count in feature_count.items():
        score += f[feature] * count

    return score

In [9]:
def viterbi(sentence, f):
        # BASE CASE
        scores = {
            0: {
                'START' : 0
            }
        }

        index = 1

        # Forward Algorithm - From START to index N
        for token in sentence:
            scores[index] = {}

            for state in possible_states:
                state_scores = {}

                for prev_tag in scores[index-1].keys():
                    e_key = "emission: " + state + '+' + token
                    t_key = "transition: " + prev_tag + '+' + token
                    e_prob = f.get(e_key, float('-inf'))
                    t_prob = f.get(t_key, float('-inf'))

                    # t_prob = self._calculate_transition_MLE(prev_tag, state)
                    # e_prob = self._calculate_emission_MLE_UNK(token, state)

                    if t_prob != float('-inf') and e_prob != float('-inf'):
                        state_scores[prev_tag] = \
                            scores[index-1][prev_tag] + \
                            t_prob + \
                            e_prob
                    else:
                        state_scores[prev_tag] = float('-inf')

                best_score = max(state_scores.values())
                scores[index][state] = best_score

            index += 1

        # Forward Algorithm - From index N to STOP
        state_scores = {}
        for prev_tag in scores[index-1].keys():
            t_key = "transition: " + prev_tag + '+' + 'STOP'
            t_prob = f.get(t_key, 0)
            if t_prob > 0:
                state_scores[prev_tag] = scores[index-1][prev_tag] + np.log(t_prob)
            else:
                state_scores[prev_tag] = float('-inf')

        y_n = max(state_scores, key=state_scores.get)
        prediction_reversed = [y_n]

        # Backtracking Algorithm
        for n in reversed(range(1,index)):
            state_scores = {}

            for state in scores[n-1].keys():
                t_key = "transition: " + state + '+' + prediction_reversed[-1]
                t_prob = f.get(t_key, 0)
                
                # t_prob = self._calculate_transition_MLE(state, prediction_reversed[-1])

                if t_prob > 0:
                    state_scores[state] = scores[n-1][state] + np.log(t_prob)

            if all(prob == float('-inf') for prob in state_scores.values()):
                prediction_reversed.append('O')
            else:
                best_state = max(state_scores, key=state_scores.get)
                prediction_reversed.append(best_state)

        prediction = []
        prediction_reversed.reverse()

        for idx, token in enumerate(sentence):
            prediction.append([token, prediction_reversed[idx+1]])

        return prediction

In [10]:
viterbi(["Loved","it"], f)

[['Loved', 'O'], ['it', 'O']]

## Part 6

In [None]:
def get_feature_count(train_dataset):
    feature_counts = {}
    for sentence in train_dataset:
        x = [token_tag_pair[0] for token_tag_pair in sentence]
        y = [token_tag_pair[1] for token_tag_pair in sentence]

        n = len(x)

        for i in range(n):
            e_key1 = "emission: " + y[i] + "+" + x[i]
            e_key2 = "emission: " + y[i] + "+" + x[i-1]
            e_key3 = "emission: " + y[i] + "+" + x[i+1]
            t_key1 = "transition: " + y[i-1] + y[i]
            combined_key1 = "transition: " + y[i-1] + y[i] + x[i]

In [11]:
# List of possible features: 
# triple transitions: y + y + y

def triple_transition_counts(train_dataset):
    triple_transition_count = {}
    for sentence in train_dataset:
        for i in range(1, len(sentence) - 1):
            start_token = sentence[i-1][1]
            mid_token = sentence[i][1]
            end_token = sentence[i+1][1]

            triple_transition_count[start_token] = triple_transition_count.get(start_token, {})
            triple_transition_count[start_token][mid_token] = triple_transition_count[start_token].get(mid_token, {})
            triple_transition_count[start_token][mid_token][end_token] = triple_transition_count[start_token][mid_token].get(end_token, 0) + 1

    return triple_transition_count

def triple_transition_probabilities(triple_transition_counts, transition_count, f):
    for start_token, mid_tokens_counts in triple_transition_counts.items():
        for mid_token, end_tokens_counts in mid_tokens_counts.items():
            for end_token, tt_count in end_tokens_counts.items():
                tt_key = "triple_transition: " + start_token + '+' + mid_token + '+' + end_token
                t_key = "transition: " + start_token + '+' + mid_token
                f[tt_key] = tt_count / transition_count[t_key]

In [12]:
pprint(triple_transition_counts([train_dataset[0]]))


{'B-negative': {'O': {'O': 1}},
 'B-positive': {'O': {'O': 1}},
 'O': {'B-negative': {'O': 1},
       'B-positive': {'O': 1},
       'O': {'B-negative': 1, 'B-positive': 1, 'O': 7}}}


In [13]:
'''
calculate_score_part6(x,y):
Helps to calulate the score for a given pair of input and output sequence pair (x,y)
Based on 2 features, emission and transition

Parameters:
x: List of tokens, e.g. x = x1, x2, ..., xn     Type: list[str]
y: List of tokens, e.g. y = y1, y2, ..., yn     Type: list[str]
f: Dictionary of feature weights                   Type: Dict{features: weights}
'''

def calculate_score(x,y,f):
    assert len(x) == len(y)

    feature_count = {}

    prev_2_tag = None
    prev_tag = 'START'
    score = 0

    length = len(x)
    for i in range(length):
        e_key = "emission: " + y[i] + '+' + x[i]
        t_key = "transition: " + prev_tag + '+' + y[i]

        if e_key in f.keys():
            feature_count[e_key] = feature_count.get(e_key, 0) + 1

        if t_key in f.keys():
            feature_count[t_key] = feature_count.get(t_key, 0) + 1
        
        if i > 1 and i < length - 1:
            tt_key = "triple_transition: " + prev_2_tag + '+' + prev_tag + '+' + y[i]
            if tt_key in f.keys():
                feature_count[tt_key] = feature_count.get(tt_key, 0) + 1
        prev_2_tag = prev_tag
        prev_tag = y[i]

        
    t_key = "transition: " + prev_tag + '+' + 'STOP'
    if t_key in f.keys():
        feature_count[t_key] = feature_count.get(t_key, 0) + 1

    tt_key = "triple_transition: " + prev_2_tag + '+' + prev_tag + '+' + 'STOP'
    if tt_key in f.keys():
        feature_count[tt_key] = feature_count.get(tt_key, 0) + 1
    
    for feature, count in feature_count.items():
        score += f[feature] * count

    return score

In [14]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

token_tags = [token_tag for sent in train_dataset for token_tag in sent]
train_tokens = [[token for token, tag in sent] for sent in train_dataset]
train_tags= [[tag for token, tag in sent] for sent in train_dataset]

def build_vocab(words):
    counter = Counter()
    for word_lst in words:
        word_lst = ['START'] + word_lst + ['STOP']
        counter.update(word_lst)
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    return vocab(ordered_dict, specials=('START', 'STOP', 'PAD'))

def build_data(token_vocab, tag_vocab, train_dataset=train_dataset):
    data = []
    for sent in train_dataset:
        token_tensor = torch.LongTensor([token_vocab[token] for token, tag in sent])
        tag_tensor = torch.LongTensor([tag_vocab[tag] for token, tag in sent])
        
        # tag_tensor = F.one_hot(tag_tensor, num_classes=len(tag_vocab))
        data.append((token_tensor, tag_tensor))
    return data

train_vocab, train_tags_vocab = build_vocab(train_tokens), build_vocab(train_tags)
train_vocab.set_default_index(train_vocab['a'])
train_tags_vocab.set_default_index(train_tags_vocab['PAD'])
train_data = build_data(train_vocab, train_tags_vocab)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
TAGS = ['O', 'B-positive', 'I-positive', 'B-negative', 'I-negative', 'B-neutral', 'I-neutral']

BATCH_SIZE = 32
SEQ_LENGTH = 75
START_TOKEN_IDX, STOP_TOKEN_IDX, PAD_TOKEN_IDX = train_vocab['START'], train_vocab['STOP'], train_vocab['PAD']
START_TAG_IDX, STOP_TAG_IDX, PAD_TAG_IDX = train_tags_vocab['START'], train_tags_vocab['STOP'], train_tags_vocab['PAD']

def process_sentence(sent_tensor, start_idx=START_TOKEN_IDX, stop_idx=STOP_TOKEN_IDX, pad_idx=PAD_TOKEN_IDX):
    sent_tensor = torch.cat([torch.tensor([start_idx]), sent_tensor, torch.tensor([stop_idx])])
    if sent_tensor.shape[0] < SEQ_LENGTH:
        sent_tensor = torch.cat([sent_tensor, torch.tensor([pad_idx] * (SEQ_LENGTH - sent_tensor.shape[0]))])
    return sent_tensor

def process_text_data(train_data):
    token_lens = [len(token_tensor) for token_tensor, tag_tensor in train_data]

    padded_token_tensors = [process_sentence(token_tensor, START_TOKEN_IDX, STOP_TOKEN_IDX, PAD_TOKEN_IDX) for token_tensor, tag_tensor in train_data]
    padded_tag_tensors = [F.one_hot(process_sentence(tag_tensor, START_TAG_IDX, STOP_TAG_IDX, PAD_TAG_IDX), num_classes=len(train_tags_vocab)) for token_tensor, tag_tensor in train_data]
    return [(token_tensor, tag_tensor) for token_tensor, tag_tensor in zip(padded_token_tensors, padded_tag_tensors)]

processed_data = process_text_data(train_data)
data_size = len(processed_data)
processed_train_data = processed_data[:int(data_size * 0.8)]
processed_val_data = processed_data[int(data_size * 0.8):]
train_dataloader = DataLoader(processed_train_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_dataloader = DataLoader(processed_val_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

  return torch._C._cuda_getDeviceCount() > 0


In [16]:
class ABSA_model(nn.Module):
    def __init__(self, vocab_size, num_tags, embedding_dim, hidden_dim, n_layers=3):
        super(ABSA_model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=True, batch_first=True, dropout=0.3)
        self.linear = nn.Linear(hidden_dim * 2, num_tags*SEQ_LENGTH)
        self.sigmoid = nn.Sigmoid()
        self.hidden_dim = hidden_dim
        self.num_tags = num_tags
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(x)
        x = x.view(x.shape[0], SEQ_LENGTH, self.num_tags, SEQ_LENGTH)
        x = self.sigmoid(x)
        return x[:, :, :, -1]

In [17]:
model = ABSA_model(vocab_size=len(train_vocab), num_tags=len(train_tags_vocab), embedding_dim=100, hidden_dim=100, n_layers=2)
model.to(device)

print(model)

ABSA_model(
  (embedding): Embedding(3977, 100)
  (lstm): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (linear): Linear(in_features=200, out_features=750, bias=True)
  (sigmoid): Sigmoid()
)


In [18]:
import copy

lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss()

def train(model, optimizer, criterion, train_dataloader, epochs=250, early_stopping=5):
    early_stopping_losses = []
    models = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for i, (token_batch, tag_batch) in enumerate(train_dataloader):
            token_batch, tag_batch = token_batch.to(device), tag_batch.to(device)
            model.zero_grad()
            output = model(token_batch)
            tag_batch = tag_batch.to(device, dtype=torch.float32)
            loss = criterion(output, tag_batch)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss /= len(train_dataloader)
        val_loss = 0.0
        model.eval()
        for i, (token_batch, tag_batch) in enumerate(val_dataloader):
            token_batch, tag_batch = token_batch.to(device), tag_batch.to(device)
            output = model(token_batch)
            tag_batch = tag_batch.to(device, dtype=torch.float32)
            loss = criterion(output, tag_batch)
            val_loss += loss.item()
        val_loss /= len(val_dataloader)
        print('Epoch: {}/{}'.format(epoch, epochs), ' Loss: {:.4f}'.format(train_loss), ' Val Loss: {:.4f}'.format(val_loss))
        
        # early stoppping
        models.append(copy.deepcopy(model))
        early_stopping_losses.append(val_loss)
        if len(early_stopping_losses) > 10:
            models.pop(0)
            early_stopping_losses.pop(0)
        if early_stopping_losses[-1] > early_stopping_losses[0]:
            print('Early stopping at epoch: {}'.format(epoch))
            model = models[np.argmax(early_stopping_losses)]
            break
        

In [19]:
train(model, optimizer, criterion, train_dataloader, epochs=250)

Epoch: 0/250  Loss: 0.1716  Val Loss: 0.0403
Epoch: 1/250  Loss: 0.0314  Val Loss: 0.0255
Epoch: 2/250  Loss: 0.0236  Val Loss: 0.0202
Epoch: 3/250  Loss: 0.0192  Val Loss: 0.0167
Epoch: 4/250  Loss: 0.0169  Val Loss: 0.0153
Epoch: 5/250  Loss: 0.0155  Val Loss: 0.0138
Epoch: 6/250  Loss: 0.0142  Val Loss: 0.0128
Epoch: 7/250  Loss: 0.0129  Val Loss: 0.0120
Epoch: 8/250  Loss: 0.0117  Val Loss: 0.0110
Epoch: 9/250  Loss: 0.0103  Val Loss: 0.0101
Epoch: 10/250  Loss: 0.0092  Val Loss: 0.0097
Epoch: 11/250  Loss: 0.0083  Val Loss: 0.0092
Epoch: 12/250  Loss: 0.0075  Val Loss: 0.0093
Epoch: 13/250  Loss: 0.0068  Val Loss: 0.0089
Epoch: 14/250  Loss: 0.0062  Val Loss: 0.0092
Epoch: 15/250  Loss: 0.0056  Val Loss: 0.0098
Epoch: 16/250  Loss: 0.0052  Val Loss: 0.0093
Epoch: 17/250  Loss: 0.0048  Val Loss: 0.0096
Epoch: 18/250  Loss: 0.0043  Val Loss: 0.0099
Epoch: 19/250  Loss: 0.0040  Val Loss: 0.0095
Epoch: 20/250  Loss: 0.0038  Val Loss: 0.0098
Early stopping at epoch: 20


In [20]:
idx_to_str = train_tags_vocab.get_itos()

def predict(model, tokens):
    padding_size = SEQ_LENGTH - len(tokens) - 2
    sent_tensor = torch.Tensor([train_vocab[token] for token in tokens])
    token_tensor = torch.unsqueeze(process_sentence(sent_tensor), 0)
    token_tensor = token_tensor.to(device, dtype=torch.int32)

    model.eval()
    output = model(token_tensor)
    output = output.detach().cpu().numpy()
    output = np.squeeze(output)[1:-padding_size-1]

    output_idx = np.argmax(output, axis=1)
    tags = []
    for idx in output_idx:
        tags.append(idx_to_str[idx])
    return tags

In [21]:
def read_dev_in_file(filename):
    with open(filename, encoding='utf-8') as f:
        file_content = f.read()

    # Split the entire file into sentences. Output: List of sentences
    sentences = file_content.strip().split('\n\n')

    # Split each sentence into their tokens
    # Output: List of sentences. Each sentence is a list of tokens
    tokens = [i.split('\n') for i in sentences]

    return tokens

def predict_dev_in(model, filename, output_filename):
    sentences = []
    with open(filename) as file:
        lines = file.readlines()
        sentence = list()
        for line in lines:
            formatted_line = line.strip()   
            
            if(len(formatted_line) ==0):
                sentences.append(sentence)
                sentence = []
                continue
            sentence.append(formatted_line)

    with open(output_filename, "w") as wf:
        for sentence in sentences:
            pred = predict(model, sentence)       
            for i in range(len(sentence)):
                wf.write(sentence[i] + " " + pred[i] + "\n")
                
            wf.write("\n")

In [22]:
predict_dev_in(model, 'dataset/dev.in', 'dataset/dev.out.lstm')

In [23]:
from conlleval import evaluate

def eval(pred,gold):
    f_pred = open(pred,encoding = 'utf-8')
    f_gold = open(gold,encoding = 'utf-8')
    data_pred = f_pred.readlines()
    data_gold = f_gold.readlines()
    gold_tags = list()
    pred_tags = list()
    
    for sentence in range(len(data_gold)):
        words_pred = data_pred[sentence].strip().split(' ')
        words_gold = data_gold[sentence].strip().split(' ')  
        if len(words_gold)==1:
            continue
        # Write original word and predicted tags
        gold_tags.append(words_gold[1])
        pred_tags.append(words_pred[1])
        # End of sentence, write newline
    return gold_tags,pred_tags


g_tags, p_tags = eval('dataset/dev.out.lstm', 'dataset/dev.out')
print(evaluate(g_tags,p_tags,verbose=True))

processed 3809 tokens with 210 phrases; found: 178 phrases; correct: 77.
accuracy:  29.97%; (non-O)
accuracy:  93.31%; precision:  43.26%; recall:  36.67%; FB1:  39.69
         negative: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
          neutral: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
         positive: precision:  43.26%; recall:  56.20%; FB1:  48.89  178
(43.258426966292134, 36.666666666666664, 39.69072164948454)
