# 2023 CITS4012 Assignment
*Make sure you change the file name with your student id.*

# Readme
*If there is something to be noted for the marker, please mention here.* 

*If you are planning to implement a program with Object Oriented Programming style, please check the bottom of this ipynb file*

In [1]:
# Installing nltk for tokenisation, POS tagging and lemmatisation
!pip install nltk



In [2]:
# Installing spacy for Named Entity Tagging
!pip install spacy



In [3]:
# To Tabulate the values
!pip install tabulate



In [2]:
# To override the error raised while installing en_core_web_sm.
# NOTE(review): presumably this selects the pure-Python protobuf implementation;
# it must be set before any library that imports protobuf is loaded — confirm.
import os

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

In [3]:
# Downloading the pre-trained NLP Model for Named Entity Tagging
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Download the NLTK resources used later in this notebook.
# NOTE(review): the resource-to-function mapping below is the usual NLTK one — confirm
# against the installed NLTK version.
import nltk
nltk.download('punkt')                        # tokenizer models (word_tokenize)
nltk.download('averaged_perceptron_tagger')   # POS tagger model (pos_tag)
nltk.download('wordnet')                      # WordNet data (WordNetLemmatizer)
nltk.download('omw-1.4')                      # Open Multilingual WordNet data

[nltk_data] Downloading package punkt to /Users/naufaln/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/naufaln/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/naufaln/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/naufaln/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

#### Importing Libraries

In [503]:
import re
import time
import math
import tensorflow as tf

# For parsing our XML data
from lxml import etree 
import numpy as np
import pandas as pd
from tabulate import tabulate
from statistics import median

# For data processing
import nltk
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer

# For Named Entity Tagging
import spacy
import en_core_web_sm
from spacy import displacy
from collections import Counter

# importing necessary libraries for TF-IDF
from nltk.tokenize import TreebankWordTokenizer

# For Modelling
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
import torch
# Select the compute device once: CUDA when available, otherwise CPU.
# Every tensor and model below is moved to this device with .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

#### Importing Data Sets

In [7]:
# Importing the training and testing data (tab-separated WikiQA files).
# Paths are relative to the notebook's working directory.
# shrink_columns() below reads the columns QuestionID, Question, Sentence and Label.
training_data = pd.read_csv('./Data/WikiQA-train.tsv', sep='\t')
test_data = pd.read_csv('./Data/WikiQA-test.tsv', sep='\t')

#### Formatting the Data Frame

In [8]:
def shrink_columns(df):
    """Collapse a sentence-level frame into one row per question.

    Args:
        df: DataFrame with the columns 'QuestionID', 'Question', 'Sentence'
            and 'Label' (one row per candidate sentence).

    Returns:
        DataFrame with the columns 'QuestionID', 'Question', 'Document' and
        'Answer', one row per distinct QuestionID in order of first appearance:
          * Question - the first question text seen for that id,
          * Document - all candidate sentences joined with single spaces,
          * Answer   - the first sentence whose Label is 1, or "" if none.
    """
    columns = ['QuestionID', 'Question', 'Document', 'Answer']
    records = []

    # A single groupby pass replaces the per-question boolean scans and the
    # row-by-row pd.concat, both of which made the original quadratic.
    # sort=False keeps groups in first-appearance order, like Series.unique().
    for qid, group in df.groupby('QuestionID', sort=False):
        answers = group.loc[group['Label'] == 1, 'Sentence']
        records.append({
            'QuestionID': qid,
            'Question': group['Question'].iloc[0],
            'Document': ' '.join(group['Sentence']),
            'Answer': answers.iloc[0] if not answers.empty else "",
        })

    # Passing `columns` keeps the four-column layout even when df has no rows.
    return pd.DataFrame(records, columns=columns)

In [9]:
# One row per question: QuestionID, Question, Document (all candidate sentences joined)
# and Answer (first sentence labelled 1, or "" when the question has no answer).
formatted_training_data = shrink_columns(training_data)
formatted_test_data = shrink_columns(test_data)

### Functions

#### Function for Labelling the document tokens

In [10]:
def generateLabels(padded_document, answer, len_org_document):
    """Label every token of a padded document as answer, non-answer or padding.

    Args:
        padded_document: list of tokens, already padded.
        answer: list of answer tokens; an empty list or "" means "no answer".
        len_org_document: number of real (unpadded) tokens in the document.

    Returns:
        A list the same length as ``padded_document`` holding '[Answer]' for the
        first occurrence of ``answer`` as a contiguous token span,
        '[Not Answer]' for every other token, and '[Pad]' for every position
        from ``len_org_document`` onwards.
    """
    labels = ["[Not Answer]"] * len(padded_document)

    answer_len = len(answer)
    # An empty answer must not match: every zero-length slice equals [], which
    # previously marked the first and last tokens as '[Answer]'.
    if answer_len:
        for start in range(len(padded_document) - answer_len + 1):
            if padded_document[start:start + answer_len] == answer:
                labels[start:start + answer_len] = ['[Answer]'] * answer_len
                break  # only the first occurrence is labelled

    # labelling the padding
    for i in range(len_org_document, len(padded_document)):
        labels[i] = "[Pad]"
    return labels

In [11]:
def removePunctuations(sentence):
    """Lower-case each whitespace-separated word and drop non-alphanumerics.

    Words made only of punctuation become empty strings; they are kept so the
    result has one entry per whitespace-separated word of the input.
    """
    non_alnum = re.compile(r"[^a-z0-9]+")
    return [non_alnum.sub('', word.lower()) for word in sentence.split()]

#### Function for tokenising a sentence

In [12]:
def tokenize(sentance):
    """Split a sentence into normalised tokens.

    The text is split with NLTK's ``word_tokenize``; each token is lower-cased
    and stripped of every character except a-z, 0-9 and '.', and tokens that
    end up empty are dropped. Full stops are kept, so '.' can appear as a token.
    """
    cleaned = (
        re.sub(r"[^a-z0-9.]+", '', raw.lower())
        for raw in word_tokenize(sentance)
    )
    return [token for token in cleaned if token != '']

#### Function for Tokenising a list of sentences

In [13]:
def tokenizeList(sequences):
    """Apply ``tokenize`` to every sentence and return the token lists in order."""
    return [tokenize(seq) for seq in sequences]

#### Function for word embedding a sentence (Using Word2Vec - Skip Gram Model)

In [14]:
def word2Vec(sentance):
    """Embed each token of one token sequence with a skip-gram Word2Vec model.

    A new model (50-dimensional vectors, window 3, sg=1) is trained on this
    single sequence only, so every call produces its own vector space.

    Returns:
        A list with one vector per token, in the order of ``sentance``.
    """
    # sg=1 selects the Skip Gram training algorithm
    skip_gram = Word2Vec([sentance], vector_size=50, window=3, min_count=1, workers=2, sg=1)
    vectors = skip_gram.wv
    return [vectors[word] for word in sentance]


#### Function for word embedding a document

In [15]:
def word2VecDocuments(document):
    """Embed every token sequence in ``document`` with ``word2Vec``.

    Returns a list holding, for each sequence, its list of per-token vectors.
    """
    return [word2Vec(sentance) for sentance in document]


#### Function to get the average length of a sequence

In [16]:
def getAverageLength(sequences):
    """Return the mean length of the given sequences, rounded to an integer.

    Args:
        sequences: iterable of sized objects (lists, strings, ...).

    Returns:
        ``round(mean(len(seq)))`` using Python's built-in rounding, or 0 when
        ``sequences`` is empty (previously this raised ZeroDivisionError).
    """
    lengths = [len(seq) for seq in sequences]
    if not lengths:
        return 0
    return round(sum(lengths) / len(lengths))

#### Function to add padding to the sequences

In [17]:
def pad_sequences(sequences):
    """Right-pad every token list with '[PAD]' up to the longest list's length.

    Args:
        sequences: re-iterable collection of token lists.

    Returns:
        A new list of new lists, all of equal length. The inputs are not
        modified. An empty collection yields ``[]`` (previously ``max()``
        raised ValueError).
    """
    # default=0 makes the empty case well-defined; lengths are already
    # integers, so the former round() call was a no-op.
    max_length = max((len(seq) for seq in sequences), default=0)

    return [seq + ['[PAD]'] * (max_length - len(seq)) for seq in sequences]

#### Function to find the TF-IDF values

In [18]:
def tfIdf(tokens, org_len):
    """Score each token of one padded document with a TF-IDF style weight.

    Args:
        tokens: padded token list of a single document.
        org_len: number of real (unpadded) tokens at the start of ``tokens``.

    Returns:
        A list of ``len(tokens)`` numbers: ``tf * idf`` for the first
        ``org_len`` tokens and 0 for the padding positions.

    Both the term frequency and N are computed over the padded length.
    Only one document is visible here, so the document frequency of every term
    is 1 and the idf factor is the same constant, ``log(N / 2) + 1``, for all
    tokens. The numeric results are identical to the earlier implementation;
    the bare ``except``, the unused ``doc_id`` counter and the one-element
    result wrapper are gone.
    NOTE(review): a corpus-level document frequency would be needed for the
    idf factor to discriminate between terms — confirm the intended design.
    """
    total_num_words = len(tokens)
    counter = Counter(tokens)

    # df == 1 for every term (single document), hence N / (df + 1) == N / 2.
    # With no tokens the factor is never used, so any placeholder will do.
    idf = math.log(total_num_words / 2) + 1 if total_num_words else 0.0

    tf_idf = [(counter[term] / total_num_words) * idf for term in tokens[0:org_len]]
    # Padding positions carry no weight.
    tf_idf.extend(0 for _ in range(org_len, total_num_words))
    return tf_idf

#### Function to get POS tags

In [19]:
def posTagging(tokens, len_org_doc):
    """POS-tag the real tokens of a padded document.

    Args:
        tokens: padded token list.
        len_org_doc: number of real (unpadded) tokens at the start of ``tokens``.

    Returns:
        A list with one entry per element of ``tokens``: the tag produced by
        ``pos_tag`` for each real token, followed by '[PAD]' for every padding
        position.
    """
    # Collecting tags with a comprehension also works when there are no real
    # tokens; the earlier ``zip(*tagged)`` unpacking raised ValueError then.
    tags_list = [tag for _, tag in pos_tag(tokens[0:len_org_doc])]
    tags_list.extend('[PAD]' for _ in range(len_org_doc, len(tokens)))
    return tags_list

#### Function to find the Named Entity Tags

In [20]:
# Loaded lazily on first use and then reused by every nerTagging call.
_NER_MODEL = None


def _getNerModel():
    """Return the shared pre-trained spaCy pipeline, loading it once."""
    global _NER_MODEL
    if _NER_MODEL is None:
        _NER_MODEL = en_core_web_sm.load()
    return _NER_MODEL


def nerTagging(document):
    """Tokenise a document with spaCy and tag every token with its entity type.

    Args:
        document: the document text.

    Returns:
        ``(tokens, NE_Tag_table)``: the lower-cased token texts and, aligned
        with them, each token's entity type, with "O" for tokens that have no
        entity type.

    The model used to be reloaded on every call, which dominates the runtime
    when this function is called once per document; it is now cached.
    """
    article = _getNerModel()(document)

    tokens = []
    NE_Tag_table = []
    for sentence in article.sents:
        for word in sentence:
            tokens.append(str(word).lower())
            # An empty ent_type_ means the token has no entity type.
            NE_Tag_table.append(str(word.ent_type_) or "O")

    return tokens, NE_Tag_table

#### Function to get the wordnet POS tag and convert to use with lemmatizer

In [21]:
def getWordnetPos(tags):
    """Map a POS tag to the single-letter POS code passed to the lemmatizer.

    The first character decides: J -> 'a' (adjective), V -> 'v' (verb),
    N -> 'n' (noun), R -> 'r' (adverb). Anything else, including the empty
    string, falls back to 'n'.
    """
    prefix_to_wordnet = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    return prefix_to_wordnet.get(tags[:1], 'n')

#### Function to Lemmatize the words using the POS tags

In [22]:
def lemmatization(tokens, tags):
    """Lemmatize each token using the POS tag at the same index.

    ``tags`` must be at least as long as ``tokens``; each tag is converted with
    ``getWordnetPos`` before being passed to the WordNet lemmatizer.
    """
    lemmatizer = WordNetLemmatizer()
    lemmitized = []
    for ind, token in enumerate(tokens):
        wordnet_pos = getWordnetPos(tags[ind])
        lemmitized.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmitized

#### Function to Preprocess the Questions list

In [495]:
def questionPreprocess(question):
    """Turn a collection of question strings into a float tensor on ``device``.

    Steps: tokenise each question, pad all of them to the longest one, embed
    each padded question with ``word2VecDocuments``, then stack the vectors
    into one tensor.
    """
    padded_tokens = pad_sequences(tokenizeList(question))
    embedded = np.array(word2VecDocuments(padded_tokens))
    return torch.from_numpy(embedded).float().to(device)


##### Helper functions for training

In [504]:
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Report elapsed time and the estimated remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far; must be non-zero.

    Returns:
        '<elapsed> (- <remaining>)', both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

# 2.QA Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

### Question Summary

#### Tokenising the Question List

In [494]:
# One question string per row of the formatted training data.
question_list = formatted_training_data["Question"]

In [496]:
# Tokenise, pad and embed all questions into a single float tensor on `device`.
question_batch_torch = questionPreprocess(question_list)

In [497]:
# Keep only the first 5 questions, matching the 5 documents assembled into
# document_vector further down. This overwrites the full batch in place.
question_batch_torch = question_batch_torch[0:5]

In [386]:
# The length of each sequence (not read again in the visible code)
seq_length = question_batch_torch.shape[1]
# The input feature dimension; read as a global by Bi_RNN_Model
n_input = question_batch_torch.shape[2]

# Set the hyperparameters 
# n_hidden is read as a global by Bi_RNN_Model and evaluate().
n_hidden = 151
# NOTE(review): trainIters() has its own learning_rate default and the visible
# call does not pass this value, so it appears unused — confirm.
learning_rate = 1e-3

In [387]:
class Bi_RNN_Model(nn.Module):
    """Bidirectional RNN that summarises each input sequence as one vector.

    Args:
        input_size: feature size of each time step. Defaults to the notebook
            global ``n_input`` when omitted.
        hidden_size: hidden size per direction. Defaults to the notebook global
            ``n_hidden`` when omitted.

    ``Bi_RNN_Model()`` therefore behaves exactly as before; the optional
    arguments allow the model to be built without relying on notebook globals.
    """

    def __init__(self, input_size=None, hidden_size=None):
        super(Bi_RNN_Model, self).__init__()
        if input_size is None:
            input_size = n_input
        if hidden_size is None:
            hidden_size = n_hidden
        # set the bidirectional to True
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True, bidirectional=True)

    def forward(self, x):
        """Return the concatenated final hidden states of both directions.

        Args:
            x: tensor of shape (batch, seq_len, input_size), as required by
                ``batch_first=True``.

        Returns:
            Tensor of shape (batch, 2 * hidden_size).
        """
        _, h_n = self.rnn(x)
        # h_n[0] is the forward direction's final state, h_n[1] the backward one
        hidden_out = torch.cat((h_n[0, :, :], h_n[1, :, :]), 1)
        return hidden_out

#### Document Modelling

##### Tokenising the document

In [513]:
# One document string per question (all candidate sentences joined by spaces).
document_list = formatted_training_data["Document"]

##### NER Tagging

In [516]:
# Tokenise every document with spaCy and collect the per-token entity tags.
# document_tokens[i] and NER_tags[i] are aligned lists for document i.
document_tokens = []
NER_tags = []

for document in document_list:
    tokens, tags = nerTagging(document)
    document_tokens.append(tokens)
    NER_tags.append(tags)

##### Padding the documents

In [517]:
# Pad every document with '[PAD]' to the length of the longest document.
# document_tokens itself keeps the unpadded lists (used later for original lengths).
document_tokens_padded = pad_sequences(document_tokens)

##### Adding [PAD] to NER Tags

In [518]:
# Extend each NER tag list in place with "[PAD]" so it lines up with the padded
# tokens. Re-running this cell is harmless: once the lengths match, the inner
# range is empty.
for ind in range(len(document_tokens_padded)):
    tag_len = len(NER_tags[ind])
    total_len = len(document_tokens_padded[ind])
    for i in range(tag_len,total_len):
        NER_tags[ind].append("[PAD]")

##### Getting the Labels

In [519]:
# Answer sentence per question ("" when the question has no labelled answer).
answer_list = formatted_training_data["Answer"]
# NOTE(review): answers are tokenised with the NLTK-based tokenize(), while the
# document tokens come from spaCy in nerTagging(). If the two tokenisers split
# the text differently, generateLabels() will not find the answer span — confirm.
answer_tokens = tokenizeList(answer_list)

In [520]:
# Per-document label sequence: '[Answer]' / '[Not Answer]' / '[Pad]' for each
# position of the padded document.
document_labels = list()

for ind in range(len(document_tokens)):
    # Length before padding; positions from here onwards are labelled '[Pad]'.
    len_org_document = len(document_tokens[ind])
    document_labels.append(generateLabels(document_tokens_padded[ind], answer_tokens[ind],len_org_document))


#### Embedding the Labels

In [None]:
# Each label string is replaced by a 50-dimensional Word2Vec vector; word2Vec()
# trains a separate model per label sequence.
# NOTE(review): the same label may therefore get different vectors in different
# documents, and these vectors are later passed to CrossEntropyLoss as targets —
# confirm this is intended rather than integer class ids.
embedded_document_labels = word2VecDocuments(document_labels)

In [None]:
# Stack the embedded labels into one float tensor on `device`.
# All label sequences have the same length because the documents were padded.
target_label_vector_torch = torch.from_numpy(np.array(embedded_document_labels)).float().to(device)

##### Word Embeddings

In [38]:
# Per-document token features, each aligned with the padded tokens:
#   pos_tags            - POS tag per token ('[PAD]' on padding)
#   lem_document_tokens - lemmatised tokens
#   tf_idf              - TF-IDF style weight per token (0 on padding)
pos_tags = []
lem_document_tokens = []
tf_idf = []
for ind in range(len(document_tokens_padded)):
    # Number of real tokens; everything after it is padding.
    len_org_document = len(document_tokens[ind])
    tags = posTagging(document_tokens_padded[ind],len_org_document)
    pos_tags.append(tags)
    lem_document_tokens.append(lemmatization(document_tokens_padded[ind],tags))
    # TF-IDF is computed on the lemmatised tokens appended just above.
    tf_idf.append(tfIdf(lem_document_tokens[ind],len_org_document))

#### Word to Vector

In [39]:
# Embed lemmas, POS tags and NER tags as 50-dimensional Word2Vec vectors.
# word2Vec() trains an independent model for every sequence it is given.
embedded_document_list = word2VecDocuments(lem_document_tokens)
embedded_pos_tags = word2VecDocuments(pos_tags)
embedded_NER_tags = word2VecDocuments(NER_tags)

In [67]:
# Build the per-token input features for the first 5 documents only.
# Each token vector concatenates: word embedding (50) + POS embedding (50)
# + NER embedding (50) + TF-IDF weight (1) = 151 values, which matches the
# hidden_size of 151 used when the decoder is constructed.
document_vector = []
for i in range(5):
    embedded_document = embedded_document_list[i]
    embedded_pos = embedded_pos_tags[i]
    embedded_NER = embedded_NER_tags[i]
    tfidf = tf_idf[i]

    
    # One feature vector per token position of document i.
    token_vector =  []
    for j in range(len(embedded_document)):
        vector = []
        vector.extend(embedded_document[j])
        vector.extend(embedded_pos[j])
        vector.extend(embedded_NER[j])
        vector.append(tfidf[j])
        token_vector.append(np.array(vector))
    document_vector.append(token_vector)

In [43]:
# tags = pos_tags[0]
# padded = document_tokens_padded[0]
# lemm = lem_document_tokens[0]
# tfidf = tf_idf[0]
# NERTags = NER_tags[0]
# for i in range(len(tags)):
#     print(tags[i]," : \t",padded[i], " : \t", lemm[i], " : \t", tfidf[i], " : \t", NERTags[i])

DT  : 	 a  : 	 a  : 	 0.027444818892627048  : 	 O
RB  : 	 partly  : 	 partly  : 	 0.006861204723156762  : 	 O
VBN  : 	 submerged  : 	 submerge  : 	 0.006861204723156762  : 	 O
NN  : 	 glacier  : 	 glacier  : 	 0.04116722833894057  : 	 O
NN  : 	 cave  : 	 cave  : 	 0.04802843306209734  : 	 O
IN  : 	 on  : 	 on  : 	 0.006861204723156762  : 	 O
NN  : 	 perito  : 	 perito  : 	 0.006861204723156762  : 	 PERSON
NN  : 	 moreno  : 	 moreno  : 	 0.006861204723156762  : 	 PERSON
NN  : 	 glacier  : 	 glacier  : 	 0.04116722833894057  : 	 PERSON
.  : 	 .  : 	 .  : 	 0.020583614169470285  : 	 O
DT  : 	 the  : 	 the  : 	 0.020583614169470285  : 	 O
NN  : 	 ice  : 	 ice  : 	 0.034306023615783804  : 	 O
NN  : 	 facade  : 	 facade  : 	 0.006861204723156762  : 	 O
VBZ  : 	 is  : 	 be  : 	 0.027444818892627048  : 	 O
RB  : 	 approximately  : 	 approximately  : 	 0.006861204723156762  : 	 O
CD  : 	 60  : 	 60  : 	 0.006861204723156762  : 	 O
NNS  : 	 m  : 	 m  : 	 0.006861204723156762  : 	 O
JJ  : 	 high 

In [103]:
# Convert input into tensors and set them to GPU by using tensor.to(device)
# document_vector is indexed [document][token][feature].
input_document_vector_torch = torch.from_numpy(np.array(document_vector)).float().to(device)

In [51]:
# Here we simply use the maximum padded document length.
# All padded documents share the same length, so this is that common length.
# MAX_LENGTH is captured as a default argument by train() and evaluate(), so it
# must be defined before those cells are run.
MAX_LENGTH = max([len(s) for s in document_tokens_padded])

#### Document Model

In [None]:
# Restrict inputs and targets to the first 5 documents.
# The input tensor was already built from 5 documents, so only the target
# tensor actually shrinks here.
input_document_vector_torch = input_document_vector_torch[0:5]
target_label_vector_torch = target_label_vector_torch[0:5]

In [None]:
# NOTE(review): nothing in the visible code appends to or reads this list — confirm
# whether it is still needed.
attention_output = []

In [511]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the question summary vectors.

    Args:
        hidden_size: feature size of each GRU input step; the GRU's own hidden
            state is ``2 * hidden_size`` wide.
        output_size: size of the output distribution.
        embedding: stored on the module but not used by ``forward``.
        dropout_p: dropout probability; the dropout layer is created but not
            applied in ``forward``.
    """

    ATTN_TYPE_DOT_PRODUCT = "Dot Product"
    # Same scores as the dot product, divided by sqrt(hidden width)
    ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"

    def __init__(self, hidden_size, output_size, embedding, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p

        self.embedding = embedding
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, 2*self.hidden_size)
        # Input is [attention output ; hidden], each 2 * hidden_size wide
        self.out = nn.Linear(self.hidden_size*4, self.output_size)

    def cal_attention(self, hidden, question_summary, method):
        """Attend from the GRU hidden state over the question summaries.

        Args:
            hidden: tensor of shape (1, B, 2 * hidden_size).
            question_summary: tensor of shape (Q, 2 * hidden_size).
            method: ``ATTN_TYPE_DOT_PRODUCT`` or ``ATTN_TYPE_SCALE_DOT_PRODUCT``.

        Returns:
            Tensor of shape (B, 4 * hidden_size): the attention output
            concatenated with the hidden state.

        Raises:
            ValueError: for an unknown ``method`` (previously this surfaced as
                an UnboundLocalError on the return statement).
        """
        if method not in (AttnDecoderRNN.ATTN_TYPE_DOT_PRODUCT,
                          AttnDecoderRNN.ATTN_TYPE_SCALE_DOT_PRODUCT):
            raise ValueError("Unknown attention method: %r" % (method,))

        keys = question_summary.unsqueeze(0)                  # (1, Q, 2H)
        scores = torch.bmm(hidden, keys.transpose(1, 2))      # (1, B, Q)
        if method == AttnDecoderRNN.ATTN_TYPE_SCALE_DOT_PRODUCT:
            scores = scores / (hidden.size(-1) ** 0.5)

        attn_weights = F.softmax(scores, dim=-1)
        attn_output = torch.bmm(attn_weights, keys)           # (1, B, 2H)
        return torch.cat((attn_output[0], hidden[0]), 1)

    def forward(self, input, hidden, question_summary):
        """Run the GRU over ``input`` and classify from its final hidden state.

        Args:
            input: tensor of shape (seq_len, B, hidden_size). The GRU is not
                created with ``batch_first``, so the first axis is the sequence.
            hidden: initial GRU state of shape (1, B, 2 * hidden_size).
            question_summary: tensor of shape (Q, 2 * hidden_size).

        Returns:
            ``(output, hidden)``: ``output`` has shape (B, output_size) with
            rows summing to 1; ``hidden`` is the final GRU state.

        NOTE(review): trainIters feeds this softmax output to
        nn.CrossEntropyLoss, which applies its own log-softmax. Confirm whether
        raw logits should be returned instead.
        """
        _, hidden = self.gru(input, hidden)

        concat_output = self.cal_attention(hidden, question_summary, AttnDecoderRNN.ATTN_TYPE_DOT_PRODUCT)

        output = F.softmax(self.out(concat_output), dim=1)
        return output, hidden

    def initHidden(self, batch_size=1):
        """Return a zero initial GRU state of shape (1, batch_size, 2 * hidden_size).

        The width now matches the GRU's hidden size (it used to be
        ``hidden_size``, which the GRU would reject), and the tensor is placed
        on the module's own device rather than a notebook global.
        """
        param_device = next(self.parameters()).device
        return torch.zeros(1, batch_size, 2*self.hidden_size, device=param_device)

#### Train Function

In [499]:
def train(input_question_tensor, input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step and return the loss averaged per target sequence.

    Args:
        input_question_tensor: question batch fed to ``encoder``.
        input_tensor: document feature tensor fed to ``decoder``.
        target_tensor: target tensor; its first axis is iterated.
        encoder, decoder: the models to update. ``decoder`` must expose its GRU
            as ``decoder.gru`` (as AttnDecoderRNN does).
        encoder_optimizer, decoder_optimizer: optimisers for the two models.
        criterion: loss called as ``criterion(prediction, target)``.
        max_length: accepted for interface compatibility; not used.

    Returns:
        The summed loss divided by ``target_tensor.size(0)``, as a float.

    Raises:
        ValueError: if ``target_tensor`` is empty along its first axis.

    NOTE(review): the decoder is called with identical arguments on every
    iteration of the outer loop, and its GRU is not ``batch_first``, so the
    first axis of ``input_tensor`` is treated as the sequence axis. Confirm
    both are intended.
    """
    target_length = target_tensor.size(0)
    if target_length == 0:
        raise ValueError("target_tensor must contain at least one sequence")

    # Zero initial hidden state for the decoder GRU. Its shape is derived from
    # the actual input and the GRU configuration instead of the previously
    # hard-coded (1, 1060, 302), so other padded lengths and sizes work too.
    decoder_initial_hidden = torch.zeros(
        1, input_tensor.size(1), decoder.gru.hidden_size, device=input_tensor.device
    )

    loss = 0
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Summarise the questions with the encoder
    encoder.train()
    question_summary = encoder(input_question_tensor)

    # The decoder consumes the whole document feature tensor at once
    decoder_input = input_tensor

    for i in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_initial_hidden, question_summary)
        target = target_tensor[i]
        # Accumulate the loss position by position for target sequence i
        for j in range(len(target)):
            loss += criterion(decoder_output[j], target[j])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [456]:
import random
def trainIters(encoder, decoder, n_iters, print_every=200, plot_every=200, learning_rate=0.002):
    """Train encoder and decoder for ``n_iters`` iterations with SGD.

    Every iteration trains on the same notebook-level tensors
    (``question_batch_torch``, ``input_document_vector_torch``,
    ``target_label_vector_torch``) through ``train``.

    Args:
        encoder, decoder: models to optimise.
        n_iters: number of training iterations.
        print_every: print the average loss every this many iterations.
        plot_every: record the average loss every this many iterations.
        learning_rate: SGD learning rate for both optimisers.

    Returns:
        The list of average losses recorded every ``plot_every`` iterations.
        It used to be built and then discarded; callers that ignore the return
        value are unaffected.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()

    # `iteration` instead of `iter`, to avoid shadowing the builtin
    for iteration in range(1, n_iters + 1):
        input_question_tensor = question_batch_torch
        input_tensor = input_document_vector_torch
        target_tensor = target_label_vector_torch

        loss = train(input_question_tensor, input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iteration / n_iters),
                                         iteration, iteration / n_iters * 100, print_loss_avg))

        if iteration % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    return plot_losses

In [512]:
# 151 = 50 (word) + 50 (POS) + 50 (NER) + 1 (TF-IDF) features per token.
hidden_size = 151
# NOTE(review): this embedding is handed to the decoder, which stores it but does
# not call it in forward() — confirm whether it is needed.
embedding = nn.Embedding(len(target_label_vector_torch), hidden_size)
encoder1 = Bi_RNN_Model().to(device)
# Output size 50 matches the 50-dimensional embedded label vectors.
attn_decoder1 = AttnDecoderRNN(hidden_size, 50, embedding, dropout_p=0.1).to(device)


# Single training iteration; with print_every=2 no progress line is printed.
trainIters(encoder1, attn_decoder1, 1, print_every=2)


torch.Size([1, 1060, 302])
torch.Size([1, 1060, 302])
torch.Size([1, 1060, 302])
torch.Size([1, 1060, 302])
torch.Size([1, 1060, 302])


#### Model Evaluation

In [489]:
def evaluate(encoder, decoder, question, max_length=MAX_LENGTH):
    """Run the encoder on ``question`` and repeatedly call the decoder.

    Reads the notebook globals ``n_hidden``, ``device``,
    ``input_document_vector_torch`` and ``document_vector``.

    NOTE(review): this function does not run to completion as written. The
    notebook output recorded for it prints 1060 and then raises
    ``TypeError: only integer tensors of a single element can be converted to
    an index``, which is consistent with indexing the Python list
    ``document_vector`` with the multi-element tensor ``topi`` below. The
    intended decoding (which label or word each prediction maps to) needs to
    be confirmed before this can be fixed.
    """
    with torch.no_grad():
        input_question = question

        # Not used after this assignment.
        input_length = input_question.size()[0]
        # Temporarily holds the integer hidden size; rebound to the encoder output below.
        encoder_hidden = n_hidden

        # Passed to the decoder as the GRU's initial hidden state.
        encoder_hiddens = torch.zeros(1,max_length, 2*encoder_hidden, device=device)


        encoder_hidden = encoder(input_question)
        # NOTE(review): encoder_hidden[0, 0] is a single element, so this adds one
        # scalar to every entry of encoder_hiddens — confirm this is intended.
        encoder_hiddens += encoder_hidden[0, 0]

        decoder_input = input_document_vector_torch

        # Overwritten by the decoder call inside the loop before being read.
        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, encoder_hiddens, encoder_hidden)
            topv, topi = decoder_output.data.topk(1) # index of the highest-scoring output per row
            print(len(topi))

            # NOTE(review): document_vector is a Python list, so indexing it with a
            # multi-element tensor fails here (see docstring).
            decoded_words.append(document_vector[topi])
            # NOTE(review): from the second iteration on, the decoder receives these
            # indices instead of the document feature tensor — confirm the intent.
            decoder_input = topi.squeeze().detach()

        return decoded_words

In [507]:
# Preprocess the first 5 questions for evaluation. They are tokenised, padded
# and embedded as a separate batch from the training questions.
questionlist = question_list[0:5]
preprocessed_question = questionPreprocess(questionlist)
question1 = preprocessed_question

In [490]:
# print(evaluate(encoder1, attn_decoder1, question1, max_length=MAX_LENGTH))

1060


TypeError: only integer tensors of a single element can be converted to an index

# 3.Model Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

###3.1. Input Embedding Ablation Study

(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 



###3.2. Attention Ablation Study
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

###3.3. Hyper Parameter Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 