## Reading the dataset

In [20]:
import pandas as pd
df = pd.read_csv('Data/WikiQA-train.tsv', sep='\t')
df = df.head(100)
df

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0
...,...,...,...,...,...,...,...
95,Q17,how much are the harry potter movies worth,D17,Harry Potter,D17-10,The initial major publishers of the books were...,0
96,Q17,how much are the harry potter movies worth,D17,Harry Potter,D17-11,The books have since been published by many pu...,0
97,Q17,how much are the harry potter movies worth,D17,Harry Potter,D17-12,"The books, with the seventh book split into tw...",0
98,Q17,how much are the harry potter movies worth,D17,Harry Potter,D17-13,The series also originated much tie-in merchan...,1


## Data Wrangling
Splitting the Dataframe into 3 Columns: (Question, Document, Answer)

In [21]:
# Create a new dataframe with four columns
new_df = pd.DataFrame(columns=['Question', 'Document', 'Answer'])

# Loop through the unique QuestionIDs in the original dataframe
for qid in df['QuestionID'].unique():
    # Get the first question associated with this QuestionID
    first_question = df.loc[df['QuestionID'] == qid, 'Question'].iloc[0]
    
    # Get all sentences associated with this QuestionID
    sentences = df.loc[df['QuestionID'] == qid, 'Sentence']
    
    # Concatenate all sentences into a single string
    concatenated_sentence = ' '.join(sentences)
    
    # Get the sentence associated with this QuestionID where the Label is 1
    answer = df.loc[(df['QuestionID'] == qid) & (df['Label'] == 1), 'Sentence']
    
    # If there is at least one such row, get the first sentence
    if not answer.empty:
        answer = answer.iloc[0]
    else:
        answer = ''
    
    # Add the QuestionID, first_question, concatenated_sentence, and answer to the new dataframe
    new_row = {'Question': first_question, 'Document': concatenated_sentence, 'Answer': answer}
    new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)

# Print the new dataframe
new_df.head(16)
               
#print(new_df.iloc[0][2])

Unnamed: 0,Question,Document,Answer
0,how are glacier caves formed?,A partly submerged glacier cave on Perito More...,A glacier cave is a cave formed within the ice...
1,How are the directions of the velocity and for...,"In physics , circular motion is a movement of ...",
2,how did apollo creed die,Apollo Creed is a fictional character from the...,
3,how long is the term for federal judges,"In the United States, the title of federal jud...",
4,how a beretta model 21 pistols magazines works,The Beretta 21A Bobcat is a small pocket-sized...,
5,how a vul works,Variable Universal Life Insurance (often short...,
6,how an outdoor wood boiler works,The outdoor wood boiler is a variant of the cl...,
7,how big did girl scout cookie boxes used to be,A mound of Girl Scout cookies. This mound cont...,
8,how big is the purdue greek system,"University Hall Purdue University, located in ...",
9,how big do sebaceous cysts get,A sebaceous cyst () is a term that loosely ref...,


In [22]:
# new_df['tokenize'] = ''

# for paragraph in new_df.Document:
#   token = ['0' for i in range(len(paragraph))]
#   if new_df.Answer == '':
#     start_index = end_index = -1
#     print(1)
#   else:
#     start_index = paragraph.find(new_df.Answer)
#     end_index = start_index + len(new_df.Answer)
#     print(start_index)
#     print(end_index)



  

### Labeling each token of the document
None: Not part of the answer  
S: Start token of the answer  
E: End token of the answer  
I: Inner token of the answer

In [23]:
new_df['labels'] = ''

for i in range(len(new_df)):
  paragraph = new_df.loc[i, 'Document']
  token = ['None' for i in range(len(paragraph))]
  if new_df.loc[i, 'Answer'] != '':
    start_index = paragraph.find(new_df.loc[i, 'Answer'])
    end_index = start_index + len(new_df.loc[i, 'Answer'])
    token[start_index] = 'S'
    for j in range(start_index+1, end_index):
      token[j] = 'I'
    token[end_index-1] = 'E'
  new_df.at[i, 'labels'] = token

In [24]:
new_df

Unnamed: 0,Question,Document,Answer,labels
0,how are glacier caves formed?,A partly submerged glacier cave on Perito More...,A glacier cave is a cave formed within the ice...,"[None, None, None, None, None, None, None, Non..."
1,How are the directions of the velocity and for...,"In physics , circular motion is a movement of ...",,"[None, None, None, None, None, None, None, Non..."
2,how did apollo creed die,Apollo Creed is a fictional character from the...,,"[None, None, None, None, None, None, None, Non..."
3,how long is the term for federal judges,"In the United States, the title of federal jud...",,"[None, None, None, None, None, None, None, Non..."
4,how a beretta model 21 pistols magazines works,The Beretta 21A Bobcat is a small pocket-sized...,,"[None, None, None, None, None, None, None, Non..."
5,how a vul works,Variable Universal Life Insurance (often short...,,"[None, None, None, None, None, None, None, Non..."
6,how an outdoor wood boiler works,The outdoor wood boiler is a variant of the cl...,,"[None, None, None, None, None, None, None, Non..."
7,how big did girl scout cookie boxes used to be,A mound of Girl Scout cookies. This mound cont...,,"[None, None, None, None, None, None, None, Non..."
8,how big is the purdue greek system,"University Hall Purdue University, located in ...",,"[None, None, None, None, None, None, None, Non..."
9,how big do sebaceous cysts get,A sebaceous cyst () is a term that loosely ref...,,"[None, None, None, None, None, None, None, Non..."


In [25]:
print(new_df.labels[0])

['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None',

## TF-IDF 

In [26]:
# importing necessary libraries for TF-IDF
from nltk.tokenize import TreebankWordTokenizer
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
import numpy as np
from collections import Counter
import math

# Create a new column in the dataframe for the tokenized version of the document
new_df['DF'] = ""
new_df['TFIDF'] = ""

tknzr = TreebankWordTokenizer()
sww = sw.words()

# function to tokenize the document and remove stopwords
def tokenize(text):
  tokens = tknzr.tokenize(text)

  # Remove stopwords, commented now for testing
  # tokens = [token for token in tokens if token not in sww]
  return tokens 

# looping through the dataframe to tokenize the document, remove punctiations and convert to lowercase
for i in range(len(new_df)):
  clean_doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', new_df.iloc[i][1])
  clean_doc = re.sub(r'\s+', ' ', clean_doc)
  clean_doc = clean_doc.lower()
  new_df.at[i, 'Document'] = clean_doc

# adding this tokenized version of the document to the dataframe
new_df['tokenize'] = new_df['Document'].apply(tokenize)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jithfernandez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# looping through the dataframe to calculate the DF and TF-IDF
for i in range(len(new_df)):
    clean_doc = new_df.iloc[i][6]
    DF = {}

    for tokenized_doc in clean_doc:
        # get each unique word in the doc - and count the number of occurrences in the document
        for term in np.unique(tokenized_doc):
            try:
                DF[term] +=1
            except:
                DF[term] =1

    # add the DF dictionary to the DataFrame as a new column
    new_df.at[i, 'DF'] = DF

    tf_idf = {}
    N = len(clean_doc) 
    doc_id = 0
    for tokenized_doc in clean_doc:
        counter = Counter(clean_doc)
        total_num_words = len(tokenized_doc) 
        for term in np.unique(tokenized_doc):
            tf = counter[term]/total_num_words
            df = DF[term]
            idf = math.log(N/(df+1))+1
            tf_idf[doc_id, term] = tf*idf

    doc_id += 1

    # add the TFIDF dictionary to the DataFrame as a new column
    new_df.at[i, 'TFIDF'] = tf_idf

In [28]:
new_df

Unnamed: 0,Question,Document,Answer,labels,DF,TFIDF,tokenize
0,how are glacier caves formed?,a partly submerged glacier cave on perito more...,A glacier cave is a cave formed within the ice...,"[None, None, None, None, None, None, None, Non...","{'a': 4, 'partly': 1, 'submerged': 1, 'glacier...","{(0, 'a'): 13.872398125886477, (0, 'partly'): ...","[a, partly, submerged, glacier, cave, on, peri..."
1,How are the directions of the velocity and for...,in physics circular motion is a movement of an...,,"[None, None, None, None, None, None, None, Non...","{'in': 5, 'physics': 1, 'circular': 4, 'motion...","{(0, 'in'): 10.946811314525242, (0, 'physics')...","[in, physics, circular, motion, is, a, movemen..."
2,how did apollo creed die,apollo creed is a fictional character from the...,,"[None, None, None, None, None, None, None, Non...","{'apollo': 3, 'creed': 5, 'is': 3, 'a': 3, 'fi...","{(0, 'apollo'): 2.397744594586097, (0, 'creed'...","[apollo, creed, is, a, fictional, character, f..."
3,how long is the term for federal judges,in the united states the title of federal judg...,,"[None, None, None, None, None, None, None, Non...","{'in': 6, 'the': 23, 'united': 8, 'states': 8,...","{(0, 'in'): 13.641453882855707, (0, 'the'): 25...","[in, the, united, states, the, title, of, fede..."
4,how a beretta model 21 pistols magazines works,the beretta 21a bobcat is a small pocket sized...,,"[None, None, None, None, None, None, None, Non...","{'the': 3, 'beretta': 4, '21a': 1, 'bobcat': 1...","{(0, 'the'): 3.4849066497880004, (0, 'beretta'...","[the, beretta, 21a, bobcat, is, a, small, pock..."
5,how a vul works,variable universal life insurance often shorte...,,"[None, None, None, None, None, None, None, Non...","{'variable': 3, 'universal': 3, 'life': 14, 'i...","{(0, 'variable'): 2.131666262580975, (0, 'univ...","[variable, universal, life, insurance, often, ..."
6,how an outdoor wood boiler works,the outdoor wood boiler is a variant of the cl...,,"[None, None, None, None, None, None, None, Non...","{'the': 3, 'outdoor': 1, 'wood': 2, 'boiler': ...","{(0, 'the'): 2.83258146374831, (0, 'outdoor'):...","[the, outdoor, wood, boiler, is, a, variant, o..."
7,how big did girl scout cookie boxes used to be,a mound of girl scout cookies this mound conta...,,"[None, None, None, None, None, None, None, Non...","{'a': 2, 'mound': 2, 'of': 6, 'girl': 3, 'scou...","{(0, 'a'): 8.664409020350408, (0, 'mound'): 1....","[a, mound, of, girl, scout, cookies, this, mou..."
8,how big is the purdue greek system,university hall purdue university located in w...,,"[None, None, None, None, None, None, None, Non...","{'university': 10, 'hall': 1, 'purdue': 12, 'l...","{(0, 'university'): 4.218875824868201, (0, 'ha...","[university, hall, purdue, university, located..."
9,how big do sebaceous cysts get,a sebaceous cyst is a term that loosely refers...,,"[None, None, None, None, None, None, None, Non...","{'a': 5, 'sebaceous': 4, 'cyst': 6, 'is': 4, '...","{(0, 'a'): 19.852072327848504, (0, 'sebaceous'...","[a, sebaceous, cyst, is, a, term, that, loosel..."
