# 2023 CITS4012 Assignment
*Make sure you change the file name with your student id.*

# Readme
*If there is something to be noted for the marker, please mention here.* 

*If you are planning to implement a program in an Object-Oriented Programming style, please check the bottom of this ipynb file.*

In [1]:
# Installing nltk for tokenisation, POS tagging and lemmatisation
!pip install nltk



In [2]:
# Installing spacy for Named Entity Tagging
!pip install spacy

Collecting typing-extensions>=4.2.0
  Downloading typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.7.4
    Uninstalling typing-extensions-3.7.4:
      Successfully uninstalled typing-extensions-3.7.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires typing-extensions~=3.7.4, but you have typing-extensions 4.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed typing-extensions-4.5.0


In [3]:
# To Tabulate the values
!pip install tabulate



In [5]:
# To override the error raised while installing en_core_web_sm:
# PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python selects protobuf's pure-Python backend.
# NOTE(review): presumably this must run before anything imports protobuf
# (e.g. tensorflow) for it to take effect — confirm.

import os

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

In [6]:
# Downloading the pre-trained NLP Model for Named Entity Tagging
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
import nltk
# Fetch the NLTK data packages this notebook relies on.
nltk.download('punkt')  # presumably backs word_tokenize / sent_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably backs pos_tag
nltk.download('wordnet')  # presumably backs WordNetLemmatizer
nltk.download('omw-1.4')  # Open Multilingual WordNet data — presumably needed alongside wordnet; confirm

[nltk_data] Downloading package punkt to /Users/naufaln/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/naufaln/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/naufaln/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/naufaln/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

#### Importing Libraries

In [17]:
import pprint
import re
import math
import tensorflow as tf

# For parsing our XML data
from lxml import etree 
import numpy as np
import pandas as pd
from tabulate import tabulate
from statistics import median

# For data processing
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer

# For Named Entity Tagging
import spacy
import en_core_web_sm
from spacy import displacy
from collections import Counter

# importing necessary libraries for TF-IDF
from nltk.tokenize import TreebankWordTokenizer

# For Modelling
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

#### Importing Data Sets

In [9]:
# Importing the training and testing data.
# Both files are read as tab-separated values from the local ./Data directory.
training_data = pd.read_csv('./Data/WikiQA-train.tsv', sep='\t')
test_data = pd.read_csv('./Data/WikiQA-test.tsv', sep='\t')

#### Formatting the Data Frame

In [10]:
def shrink_columns(df):
    """Collapse a one-row-per-sentence frame into one row per question.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``QuestionID``, ``Question``, ``Sentence``
        and ``Label``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``QuestionID`` (in order of first appearance)
        with the columns:

        * ``QuestionID`` - the question identifier,
        * ``Question``   - the first question text seen for that identifier,
        * ``Document``   - all sentences of the question joined by a space,
        * ``Answer``     - the first sentence whose ``Label`` is 1, or a
          missing value when the question has no positive sentence.
    """
    columns = ['QuestionID', 'Question', 'Document', 'Answer']

    # Collect plain records and build the frame once: growing a DataFrame with
    # pd.concat inside the loop copies every previous row on each iteration.
    records = []

    # sort=False keeps the questions in their original order of appearance;
    # a single groupby pass replaces three full-frame boolean scans per question.
    for qid, group in df.groupby('QuestionID', sort=False):
        positive_sentences = group.loc[group['Label'] == 1, 'Sentence']
        records.append({
            'QuestionID': qid,
            'Question': group['Question'].iloc[0],
            'Document': ' '.join(group['Sentence']),
            'Answer': positive_sentences.iloc[0] if not positive_sentences.empty else None,
        })

    # Passing `columns` guarantees the expected schema even when df is empty.
    return pd.DataFrame(records, columns=columns)

In [11]:
# Reduce both splits to one row per question (QuestionID, Question, Document, Answer).
formatted_training_data = shrink_columns(training_data)
formatted_test_data = shrink_columns(test_data)

### Functions

#### Function for Labelling the document tokens

In [21]:
def generateLabels(document, answer):
    """Label every position of ``document`` as answer / not answer.

    Parameters
    ----------
    document : str
        Text to label. One label is produced per position of ``document``
        (``len(document)`` labels in total).
    answer : str or None
        Span to look for with ``document.find``. ``None``, other non-string
        values (e.g. NaN) and the empty string mean "no answer".

    Returns
    -------
    list of str
        ``'[Answer]'`` for the positions covered by the first occurrence of
        ``answer`` in ``document`` and ``'[Not Answer]'`` everywhere else.
        When there is no answer, or it does not occur in ``document``, every
        label is ``'[Not Answer]'``.
    """
    labels = ["[Not Answer]"] * len(document)

    # A missing answer must not reach str.find (it raises TypeError on None).
    if not isinstance(answer, str) or answer == '':
        return labels

    start_index = document.find(answer)
    # find() returns -1 when the answer is absent; using that as a start index
    # would label the last position and a prefix of the document by mistake.
    if start_index == -1:
        return labels

    end_index = start_index + len(answer)
    labels[start_index:end_index] = ['[Answer]'] * len(answer)
    return labels

#### Function for tokenising a sentence

In [15]:
def tokenize(sentance):
    """Lower-case, strip parenthesised text and tokenise a sentence.

    Parameters
    ----------
    sentance : str
        Raw sentence text.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` with every character other than
        ``a-z``, ``0-9`` and ``.`` removed. Tokens that become empty after
        this clean-up (e.g. a lone comma) are dropped.
    """
    # Remove "( ... )" spans before tokenising.
    content_text = re.sub(r'\([^)]*\)', '', sentance.lower())

    normalized_text = []
    for raw_token in word_tokenize(content_text):
        # The text is already lower-cased, so only the character filter is needed.
        cleaned = re.sub(r"[^a-z0-9.]+", "", raw_token)
        # Punctuation-only tokens collapse to '' here; skip them instead of
        # emitting empty-string tokens into the vocabulary.
        if cleaned:
            normalized_text.append(cleaned)

    return normalized_text

#### Function for tokenising a list of sentences

In [16]:
def tokenizeList(sequences):
    """Apply ``tokenize`` to each item of ``sequences``.

    Returns a list holding one token list per input item, in input order.
    """
    return [tokenize(text) for text in sequences]

#### Function for word embedding a sentence (Using Word2Vec - Skip Gram Model)

In [None]:
def word2Vec(sentance):
    """Embed each token of one sentence with a Skip-Gram Word2Vec model.

    Parameters
    ----------
    sentance : sequence of str
        The tokens of a single sentence (assumed already tokenised).

    Returns
    -------
    list
        One 100-dimensional vector per token, in token order. An empty
        sentence yields an empty list.

    Notes
    -----
    NOTE(review): the model is trained from scratch on this single sentence,
    so the vectors carry very little information. Training one model on the
    whole corpus and reusing it is probably what is wanted.
    """
    if len(sentance) == 0:
        # Word2Vec cannot build a vocabulary from an empty corpus.
        return []

    # Word2Vec expects an iterable of sentences (each a list of tokens);
    # passing the bare token list would treat every token as a sentence of
    # characters.
    corpus = [list(sentance)]

    # min_count=1 keeps every token in the vocabulary; with the former
    # min_count=5 almost every word was discarded and the lookup below
    # raised KeyError. sg=1 selects the Skip-Gram architecture.
    common_args = dict(window=5, min_count=1, workers=2, sg=1)
    try:
        # gensim >= 4.0 names the dimensionality parameter `vector_size`.
        wv_sg_model = Word2Vec(sentences=corpus, vector_size=100, **common_args)
    except TypeError:
        # gensim < 4.0 calls the same parameter `size`.
        wv_sg_model = Word2Vec(sentences=corpus, size=100, **common_args)

    # list.append returns None, so the result must be built without
    # re-assigning the list to append's return value.
    return [wv_sg_model.wv[word] for word in sentance]


#### Function for word embedding a document

In [None]:
def word2VecDocuments(document):
    """Embed every sentence of a document with ``word2Vec``.

    Parameters
    ----------
    document : iterable
        The sentences of the document; each item is passed to ``word2Vec``.

    Returns
    -------
    list
        One ``word2Vec`` result per sentence, in input order.
    """
    # Build the list directly: the previous `x = x.append(...)` pattern
    # replaced the list with None after the first sentence.
    return [word2Vec(sentance) for sentance in document]


#### Function to get the average length of a sequence

In [19]:
def getAverageLength(sequences):
    """Return the mean length of the given sequences, rounded to an int.

    Parameters
    ----------
    sequences : iterable of sized objects
        Anything whose items support ``len``.

    Returns
    -------
    int
        ``round(mean length)`` using Python's built-in ``round``; ``0`` when
        ``sequences`` is empty (previously a ``ZeroDivisionError``).
    """
    list_of_lengths = [len(seq) for seq in sequences]
    if not list_of_lengths:
        return 0
    return round(sum(list_of_lengths) / len(list_of_lengths))

#### Function to add padding to the sequences

In [20]:
def pad_sequences(sequences, pad_token='[PAD]'):
    """Right-pad every sequence to the length of the longest one.

    Parameters
    ----------
    sequences : iterable of sequences
        The token sequences to pad. The inputs are not modified.
    pad_token : object, default ``'[PAD]'``
        Value appended to sequences shorter than the longest one.

    Returns
    -------
    list of list
        A new list per input sequence, each of the same length. An empty
        input yields an empty list (previously ``max`` raised ``ValueError``).
    """
    # Materialise once so generators work and non-list rows (e.g. tuples)
    # can be concatenated with the padding list.
    rows = [list(seq) for seq in sequences]
    if not rows:
        return []

    max_length = max(len(row) for row in rows)

    return [row + [pad_token] * (max_length - len(row)) for row in rows]

#### Function to find the TF-IDF values

In [22]:
def tfIdf(tokens):
    """Compute a TF-IDF style weight for every token of one token list.

    Parameters
    ----------
    tokens : sequence of hashable
        The tokens of a single document.

    Returns
    -------
    list of float
        One weight per token, in token order: ``tf * idf`` where
        ``tf = count(token) / len(tokens)`` and
        ``idf = log(len(tokens) / (df + 1)) + 1`` with ``df = 1``.
        An empty input yields an empty list.

    Notes
    -----
    NOTE(review): only one document is ever seen, so the document frequency
    of every term is 1 and ``N`` is the number of tokens, not the number of
    documents. The IDF factor is therefore the same constant for all tokens
    and the result is just a rescaled term frequency. The numeric output of
    the original implementation is preserved here; confirm whether a
    corpus-level document frequency was intended.
    """
    total_num_words = len(tokens)
    if total_num_words == 0:
        return []

    counter = Counter(tokens)

    # Every distinct term was counted exactly once, so df is always 1.
    df = 1
    idf = math.log(total_num_words / (df + 1)) + 1

    return [(counter[term] / total_num_words) * idf for term in tokens]

#### Function to get POS tags

In [23]:
def posTagging(tokens):
    """Return the part-of-speech tag of every token.

    Parameters
    ----------
    tokens : sequence of str
        Tokens to tag with ``pos_tag``.

    Returns
    -------
    tuple of str
        One tag per token, in token order; an empty tuple for empty input
        (previously unpacking ``zip(*[])`` raised ``ValueError``).
    """
    if len(tokens) == 0:
        return ()
    # pos_tag yields (word, tag) pairs; only the tags are kept.
    return tuple(tag for _word, tag in pos_tag(tokens))

#### Function to find the Named Entity Tags

In [24]:
# Cache for the spaCy pipeline so it is loaded at most once per kernel session.
_NER_MODEL = None


def _get_ner_model():
    """Load the pre-trained ``en_core_web_sm`` pipeline on first use and reuse it."""
    global _NER_MODEL
    if _NER_MODEL is None:
        _NER_MODEL = en_core_web_sm.load()
    return _NER_MODEL


def nerTagging(document, model=None):
    """Tokenise ``document`` and return its named-entity tag per token.

    Parameters
    ----------
    document : str
        Raw text to analyse.
    model : callable, optional
        A pipeline that maps text to an iterable of tokens exposing
        ``ent_type_``. Defaults to the cached ``en_core_web_sm`` pipeline;
        loading it on every call, as before, is slow.

    Returns
    -------
    tuple (list of str, list of str)
        The lower-cased token texts and, aligned with them, the entity type
        of each token, with ``"O"`` for tokens that have no entity type.
    """
    entity_tagging_model = model if model is not None else _get_ner_model()
    article = entity_tagging_model(document)

    tokens = []
    NE_Tag_table = []
    # Iterating the document visits the same tokens, in the same order, as
    # walking every sentence of it.
    for word in article:
        tokens.append(str(word).lower())
        # Empty entity type means "outside any entity".
        NE_Tag_table.append(str(word.ent_type_) or "O")

    return tokens, NE_Tag_table

#### Function to get the wordnet POS tag and convert to use with lemmatizer

In [25]:
def getWordnetPos(tags):
    """Translate a POS tag into the single-letter code used by the lemmatizer.

    Tags starting with ``J`` map to ``'a'`` (adjective), ``V`` to ``'v'``
    (verb) and ``R`` to ``'r'`` (adverb). Everything else, nouns included,
    maps to ``'n'``.
    """
    prefix_to_wordnet = (
        ('J', 'a'),  # Adjective
        ('V', 'v'),  # Verb
        ('R', 'r'),  # Adverb
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if tags.startswith(prefix):
            return wordnet_pos
    # Nouns and any unrecognised tag fall back to noun.
    return 'n'

#### Function to lemmatize the words using the POS tags

In [None]:
def lemmatization(tokens):
    """Lemmatize words using their POS tags.

    ``tokens`` is indexed with the keys ``'Words'`` and ``'POS Tags'``; the
    two entries are walked in parallel and each word is lemmatized with the
    WordNet POS derived from its tag by ``getWordnetPos``.

    Returns the list of lemmas in word order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, pos_label in zip(tokens['Words'], tokens['POS Tags']):
        wordnet_pos = getWordnetPos(pos_label)
        lemmas.append(wordnet_lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas

# To be Removed

In [12]:
# Temporary working subset: rows 11-15 of the formatted training data (5 questions).
# Lives under the "To be Removed" heading — drop it before running on the full set.
train_data = formatted_training_data[11:16]

# 2.QA Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

# 3.Model Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

###3.1. Input Embedding Ablation Study

(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 



###3.2. Attention Ablation Study
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 

###3.3. Hyper Parameter Testing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title) 