In [13]:
# Turn off unnecessary warnings
import warnings
warnings.filterwarnings("ignore")

# Import all the required packages
import json
import nltk
import string
import urllib
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.wsd import lesk
from nltk.parse import CoreNLPParser
from nltk.corpus import stopwords
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.stem import PorterStemmer
from nltk.stem.porter import PorterStemmer
import xlwt
from xlwt import Workbook

In [16]:
# Start common things globally
stop_words = stopwords.words('english') + list(string.punctuation)
dependencyParser = CoreNLPDependencyParser(url='http://localhost:9000')
namedEntityTagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
wordnet_lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [17]:
# Performs Word tokenization on sentences
def Tokenization(sentence):
    tokens = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]
    return tokens


# Performs Word Lemmatization : Uses context
def Lemmatization(word_tokens):
    lemmas = []
    for token in word_tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas


# Performs Stemming : Uses word stem
def Stemming(word_tokens):
    stems = [porter.stem(word) for word in word_tokens]
    return stems


# Performs POS tagging
def POSTagging(sentence):
    word_tokens = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]
    POStags = nltk.pos_tag(word_tokens)
    return POStags   


In [18]:
# Performs Dependency Parsing
def DependencyParsing(sentence):
    # Perform dependency parsing
    parse, = dependencyParser.raw_parse(sentence)
    
    # Dependency parsing to parse tree based patterns as features
    depParseResult = list(parse.triples())
    
    return depParseResult


# Obtains Named Entities
def NamedEntities(sentence, tokens):
    # Word tokenize again and use them if NEs are present
    namedTokens = nltk.word_tokenize(sentence)
    NEtags = None
    
    try:
        NEtags = namedEntityTagger.tag(namedTokens)
    except:
        NEtags = namedEntityTagger.tag(tokens)
        
    return NEtags


In [19]:
# Obtains sentence heads
def getHeads(sentence, word_tokens):
    # Create a head list to add the heads
    headList = []
    
    # Split the sentence
    stripedSen = sentence.strip(" '\"")
    if stripedSen != "":
        # Perform dependency parse
        depParse = dependencyParser.raw_parse(stripedSen)
        parseTree = list(depParse)[0]
        headWord = ""
        headWord = [k["word"] for k in parseTree.nodes.values() if k["head"] == 0][0]
        
        # Appends head if it's not empty
        if headWord != "":
            headList.append([headWord])
            
        # Obtain head word based on two cases
        else:
            for i, pp in enumerate(tagged):
                if pp.startswith("VB"):
                    headList.append([word_tokens[i]])
                    break
            if headWord == "":
                for i, pp in enumerate(tagged):
                    if pp.startswith("NN"):
                        headList.append([word_tokens[i]])
                        break
                        
    # For empty sentence, we just append "" as head
    else:
        headList.append([""])
 
    return headList


In [20]:
# Obtains WordNet Features
def WordNetFeatures(sentence, word_tokens):
    # Creates dictionaries for important word senses
    hypernyms_list = []
    hyponyms_list = []
    meronyms_list = []
    holonyms_list = []
    synonyms_list = []
    
    # Populates the above dictionaries according to the word senses associated with them
    for token in word_tokens:
        # Extracts best sense for each word using LESK
        best_sense = lesk(sentence, token)
        
        if best_sense is not None:
            # Obtains Synonyms
            synonym = token
            if best_sense. lemmas()[0].name() != token:
                synonym = best_sense.lemmas()[0].name()
            synonyms_list.append(synonym)
            
            # Obtains Hypernyms
            if best_sense.hypernyms() != []:
                hypernyms_list.append(best_sense.hypernyms()[0].lemmas()[0].name())
        
            # Obtains Hyponyms
            if best_sense.hyponyms() != []:
                hyponyms_list.append(best_sense.hyponyms()[0].lemmas()[0].name())
            
            # Obtains Meronyms
            if best_sense.part_meronyms() != []:
                meronyms_list.append(best_sense.part_meronyms()[0].lemmas()[0].name())
                
            # Obtains Holonyms
            if best_sense.part_holonyms() != []:
                holonyms_list.append(best_sense.part_holonyms()[0].lemmas()[0].name())
          
        # When there's no best sense, the token itself is the Synonym
        else:
            synonyms_list.append(token)
            
    return hypernyms_list, hyponyms_list, meronyms_list, holonyms_list, synonyms_list
   

In [21]:
# NLP pipeline through which all the articles & question will pass
def NLP_Pipeline(sentence, count, data_dict, articleName = None):
    #print("\n------SENTENCE------")
    #print(sen)

    word_tokens = Tokenization(sentence)
    #print("\nWord Tokenization : Done")
    #print(word_tokens)

    word_NEtags = NamedEntities(sentence, word_tokens)
    #print("\nNamed Entity Tagging : Done")
    #print(word_NEtags)
    
    word_lemmas = Lemmatization(word_tokens)
    #print("Word Lemmatization : Done")
    #print(word_lemmas)
    
    word_stems = Stemming(word_tokens)
    #print("Word Stemming : Done")
    #print(word_stems)

    word_POStags = POSTagging(sentence)
    #print("POS Tagging : Done")
    #print(word_POStags)

    hypernyms, hyponyms, meronyms, holonyms, synonyms = WordNetFeatures(sentence, word_tokens)
    #print("WordNet Feature Extraction : Done")
    #print(holonyms)
            
    depParse = DependencyParsing(sentence)
    #print("Dependency Parsing : Done")
    #print(depParse)

    headList = getHeads(sentence, word_tokens)
    #print("Obtaining Heads : Done")
    #print(headList)

    # Process data format to suit the Elastic Search requirements
    count = count + 1
    data_dict[count] = {}
            
    data_dict[count]["sentence"] = {}
    data_dict[count]["sentence"] = sentence
            
    data_dict[count]["tokenized_text"] = {}
    data_dict[count]["tokenized_text"] = word_tokens
            
    data_dict[count]["lemma"] = {}
    data_dict[count]["lemma"] = word_lemmas
    
    data_dict[count]["stems"] = {}
    data_dict[count]["stems"] = word_stems
    
    data_dict[count]["ner_tag"] = {}
    if articleName is not None:
        data_dict[count]["ner_tag"] = str(dict(word_NEtags))
    else:
        data_dict[count]["ner_tag"] = dict(word_NEtags)
            
    data_dict[count]["tags"] = {}
    data_dict[count]["tags"] = word_POStags
            
    data_dict[count]["dependency_parse"] = {}
    data_dict[count]["dependency_parse"] = depParse
            
    data_dict[count]["synonyms"] = {}
    data_dict[count]["synonyms"] = synonyms
            
    data_dict[count]["hypernyms"] = {}
    data_dict[count]["hypernyms"] = hypernyms
            
    data_dict[count]["hyponyms"] = {}
    data_dict[count]["hyponyms"] = hyponyms
            
    data_dict[count]["meronyms"] = {}
    data_dict[count]["meronyms"] = meronyms
            
    data_dict[count]["holonyms"] = {}
    data_dict[count]["holonyms"] = holonyms
            
    data_dict[count]["head_word"] = {}
    data_dict[count]["head_word"] = headList[0]
    
    # For question, we don't have the article name and then it will have a questionType
    if articleName is not None:
        data_dict[count]["file_name"] = {}
        data_dict[count]["file_name"] = articleName
        
        
    # For question, we should add the question type
    else:
        tokens = nltk.word_tokenize(sentence)
        questionTypes = ["who", "when", "what", "whom"]
        queType = [i for i in questionTypes if i in tokens]
        data_dict[count]["question_type"] = {}
        data_dict[count]["question_type"] = queType
    
    return count, data_dict
    

In [None]:
""" ------------------------ TASK 1 ------------------------ """

# List of all article names in the repository
articleNames = ["109.txt", "111.txt", "151.txt", "160.txt", "177.txt", 
                "179.txt","181.txt", "196.txt", "199.txt", "220.txt", 
                "222.txt", "226.txt", "247.txt", "273.txt", "281.txt", 
                "282.txt", "285.txt", "287.txt", "288.txt", "297.txt", 
                "304.txt", "342.txt", "347.txt", "360.txt", "390.txt", 
                "400.txt", "428.txt", "56.txt", "58.txt", "6.txt"] 
fileCount = len(articleNames)
    
content = ""
urlPath = "https://raw.githubusercontent.com/SaiManasaVedantam/NLP-QA-System-Datasets/main/Articles/"

for i in range(fileCount):
    print("\nStarted Processing File : " + articleNames[i])
    fileName = urlPath + articleNames[i]
    response = urllib.request.urlopen(fileName)
    webContents = response.read()
    stringTypeData = webContents.decode("utf-8")
    content = stringTypeData
    count = 0
    data_dict = {}

    # Get tokenized sentences
    sentences = []
    sentences.extend(tokenizer.tokenize(content))

    # Sentence count
    #print("Total Sentences After splitting the document: ", len(sentences))
    print("Extracting features for each sentence in the file...")
    
    # Extracting words
    for sen in sentences:
        count, data_dict = NLP_Pipeline(sen, count, data_dict, articleNames[i])
                
    output_name = '../Pipeline-Output/Parsed-' + articleNames[i]
    with open(output_name, 'w+', encoding='utf8') as output_file:
        json.dump(data_dict, output_file,  indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False)
        
    print("Completed Processing File : " + articleNames[i])
        
print("\nTask 1 Successfully Completed !!!")
    

In [2]:
""" ------------------------ TASK 2 - PART 1 ------------------------ """

# Turn off unnecessary warnings
import warnings
warnings.filterwarnings("ignore")

# Import all the required packages
import ssl
import json
import urllib
import requests
from elasticsearch import Elasticsearch
from elasticsearch import RequestsHttpConnection


In [3]:
# List of all article names in the repository
articleNames = ["109.txt", "111.txt", "151.txt", "160.txt", "177.txt", 
                "179.txt","181.txt", "196.txt", "199.txt", "220.txt", 
                "222.txt", "226.txt", "247.txt", "273.txt", "281.txt", 
                "282.txt", "285.txt", "287.txt", "288.txt", "297.txt", 
                "304.txt", "342.txt", "347.txt", "360.txt", "390.txt", 
                "400.txt", "428.txt", "56.txt", "58.txt", "6.txt"] 
fileCount = len(articleNames)
    
# Setup Elastic Search
elastic = Elasticsearch([{'host': 'localhost', 'port': 9200, 'use_ssl' : False, 'ssl_verify' : False}], timeout=30, max_retries=10)
    
# Obtain requests from the page
req = requests.get("http://localhost:9200", verify=False)
    
# Use indexing
idx = 1
    
content = ""
urlPath = "https://raw.githubusercontent.com/SaiManasaVedantam/NLP-QA-System-Datasets/main/Pipeline-Output/Parsed-"
    
for i in range(fileCount):
    print("\nStarted Processing File : " + articleNames[i])
    fileName = urlPath + articleNames[i]
    response = urllib.request.urlopen(fileName)
    webContents = response.read()
    stringTypeData = webContents.decode("utf-8")
    content = stringTypeData
        
    # Obtain Json data from file contents
    jsonFile = json.loads(content)
        
    # Creating new index "articles" for each line in the article
    for key, value in jsonFile.items():
        elastic.index(index = "articles", doc_type = "text", id = idx, body = value)
        # print("Here")
        idx += 1
            
    print("Finished Processing File : " + articleNames[i])
        
print("\nElastic Search Successfully Completed !!!")
    


Started Processing File : 109.txt
Finished Processing File : 109.txt

Started Processing File : 111.txt
Finished Processing File : 111.txt

Started Processing File : 151.txt
Finished Processing File : 151.txt

Started Processing File : 160.txt
Finished Processing File : 160.txt

Started Processing File : 177.txt
Finished Processing File : 177.txt

Started Processing File : 179.txt
Finished Processing File : 179.txt

Started Processing File : 181.txt
Finished Processing File : 181.txt

Started Processing File : 196.txt
Finished Processing File : 196.txt

Started Processing File : 199.txt
Finished Processing File : 199.txt

Started Processing File : 220.txt
Finished Processing File : 220.txt

Started Processing File : 222.txt
Finished Processing File : 222.txt

Started Processing File : 226.txt
Finished Processing File : 226.txt

Started Processing File : 247.txt
Finished Processing File : 247.txt

Started Processing File : 273.txt
Finished Processing File : 273.txt

Started Processing 

In [4]:
""" ------------------------ TASK 2 - PART 2 ------------------------ """
# Import all necessary packages
import dateparser
import ast
import re

In [6]:
# Obtains features in the question by using the result obtained from the NLP Pipeline
def questionFeatures(question):
    # Get all the wordnet features
    WNfeatures = question[1]['synonyms'] + question[1]['meronyms'] + question[1]['hyponyms'] + question[1]['holonyms'] + question[1]['hypernyms']
       
    # Create hints for easy search using Named Entities and the Sentence head
    head = question[1]['head_word'][0]
    NEs = question[1]['ner_tag']
    NEhints = ""
    namedEntities = []
    
    for word, entity in NEs.items():
        namedEntities.append(entity)
        if entity == 'ORGANIZATION' or entity == 'LOCATION' or entity == 'PERSON':
            NEhints += " " + word + " "
            
    NEhints += " " + head + " "
    
    # Obtain question type and other features
    queType = question[1]['question_type']
    lemmas = question[1]['lemma']
    stems = question[1]['stems']
    depParse = question[1]['dependency_parse']

    depList = list(list(x) for x in depParse)
    depElements = []
    
    for i in depList:
        if i[1] == 'nsubj' or i[1] == 'dobj':
            depElements.append(i[0])
     
    # Retrieve main elements from the dependency parse result
    dependencyList = list(list(x) for x in depElements)

    return NEhints, WNfeatures, queType, lemmas, stems, dependencyList


In [7]:
# Check and obtain matched sentences using the query string
def GetMatchedSentences(queryStr, dependencyList):
    querybody = {
        "query": {
            "dis_max": {
                "queries": [
                    # { "match": { "lemma": {"query": spclQuery,"boost": 2}  }},
                    {"multi_match": {'query': queryStr, "fields": [
                        # "lemma^2.0", "synonyms^0.5", "meronyms^0.1", "holonyms^0.1", "hypernyms^0.1", "hyponyms^0.1"]}},
                        "lemma^2", "ner_tag", "synonyms^0.7", "meronyms^0.1", "holonyms^0.1", "hypernyms^0.5", "hyponyms^0.2"]}},
                    ]
                }
            }
        }

    result = elastic.search(index = "articles", body=querybody)
    answers = result['hits']['hits']
    depParses, sentences, scores, articles, NEs = [], [], [], [], []
    
    for i in range(len(answers)):
        sentence = result['hits']['hits'][i]['_source']['sentence']
        sentences.append(sentence)
        
        score = result['hits']['hits'][i]['_score']
        scores.append(score)
        
        depParse = result['hits']['hits'][i]['_source']['dependency_parse']
        depParses.append(depParse)
        
        article = result['hits']['hits'][i]['_source']['file_name']
        articles.append(article)
        
        NE = result['hits']['hits'][i]['_source']['ner_tag']
        NEs.append(NE)
        
    return sentences, scores, depParses, articles, NEs


In [48]:
# Find the match score to know how well a statement is matched
def FindScore(queType, NEhints, sentences, scores, depParses, articles, NEs, dependencyList):
    # Add additional World Knowledge to implement a much deeper NLP pipeline
    # Named Entities
    organizations = ['ORGANIZATION']
    persons = ['PERSON']
    locations = ['LOCATION', 'PLACE', 'CITY', 'COUNTRY', 'STATE_OR_PROVINCE']
    times = ['TIME', 'DATE', 'NUMBER']
    
    # Feeding world knowledge for a deeper pipeline
    deaths = ['die', 'died', 'assassination']
    births = ['born', 'birth', 'life']
    keywords = NEhints.split()
    keywords = [item.lower() for item in keywords]
    
    # Obtain relations using Dependency Parse result
    count = 0
    relations = []
    for dep in depParses:
        for i in dep:
            if i[1] == 'nsubj' or i[1] == 'dboj':
                if i[0] in dependencyList:
                    relations.append([count,i[0]])
        count += 1
        
    # Get question type
    questionType = queType[0].lower()
    answers = [] 
    
    # Set for relation
    """print("\n", relations)
    for reln in relations:
        idx = relations[0]
        print(idx)
        scores[idx] += 100"""

    # Handle different question types
    if questionType == 'who' or questionType == 'whom':
        for NE in NEs:
            # Obtain all the named entities which are initially stored as a stringified dictionary
            NEdict = eval(NE)
            ans = []
            for key, value in NEdict.items():
                if value in persons or organizations:
                    ans.append(key)
                if (ans != [] and key == ',') or (ans != [] and key == 'and'):
                    ans.append(key)
                    
            answers.append(' '.join(ans))

    if questionType == 'when':
        for NE in NEs:
            # Obtain all the named entities which are initially stored as a stringified dictionary
            NEdict = eval(NE)
            ans = []
            for key, value in NEdict.items():
                if value in times and dateparser.parse(key) is not None:
                    ans.append(key)

            answers.append(' '.join(ans))

    """if questionType == 'what':
        for NE in NEs:
            # Obtain all the named entities which are initially stored as a stringified dictionary
            NEdict = eval(NE)
            ans = []
            for key, value in NEdict.items():
                if value in locations or value in organizations:
                    ans.append(key)
                if (ans != [] and key == ',') or (ans != [] and key == 'and'):
                    ans.append(key)
                    
            answers.append(' '.join(ans))"""

    
    for idx in range(len(answers)):
        if len(answers[idx]) < 3:
            scores[idx] -= 100
    
    """# Level 2 handling for When questions as it can also be about births & deaths
    dieconcept = 0       
    if questionType == 'when':
        for key in range(len(sentences)):
            for j in deaths:
                if j in sentences[key]:
                    pattern = r"\((.*?)\)"
                    try:
                        matched = re.findall(pattern, sentences[key])
                        splits = matched[0].split(' ')
                        splitjoin = ' '.join(splits[4:])
                        answers[key] = splitjoin
                        dieconcept = 1
                    except:
                        pass
                    scores[key] += 50
                    
        if dieconcept == 0:
            for key in range(0,len(sentences)):
                for j in births:
                    if j in sentences[key]:
                        pattern = r"\((.*?)\)"
                        try:
                            matched = re.findall(pattern, sentences[key])
                            splits = matched[0].split(' ')
                            splitjoin = ' '.join(splits)
                            answers[key] = splitjoin
                            dieconcept = 1
                        except:
                            pass
                        scores[key] += 10"""

    results = zip(sentences, articles, scores)
    sortedResults = sorted(results, key = lambda x: x[2])

    return reversed(sortedResults)


"""# Build a deeeper NLP pipeline for better results
def deeperPipeline(NEhints, WNfeatures, question, queType):
    helpers = ""
    
    # We can obtain some domain knowledge for person, organization, location
    # For some random organization
    if 'Apple' in keywords:
        helpers += " Apple inc. computer apple Apple Inc. "
    if 'die' in keywords:
        helpers += ' assassination '
    if 'born' in keywords:
        helpers += ' life '
    
    month = " january jan february feb march mar april apr may june jun july jul august aug september sep october oct november nov december dec "
    year = " 1969 1970 1971 1980 1981 1982 1983 1984 1975 1976 1977 1978 1979 2001 2002 2003 2004 2005 2006 2007 2010 2012 2013 2014 2015"
    # where locations
    if ques_type[0].lower() == 'who':
        if 'found' in question:
        print("Who question")
        
    if ques_type[0].lower() == 'when':
        print("When question")
        
    if ques_type[0].lower() == 'where':
        if 'birth' in question:
            heuristics += ' born '
        heuristics += ' locate '
        print("Where question")

    return heuristics
"""


'# Build a deeeper NLP pipeline for better results\ndef deeperPipeline(NEhints, WNfeatures, question, queType):\n    helpers = ""\n    \n    # We can obtain some domain knowledge for person, organization, location\n    # For some random organization\n    if \'Apple\' in keywords:\n        helpers += " Apple inc. computer apple Apple Inc. "\n    if \'die\' in keywords:\n        helpers += \' assassination \'\n    if \'born\' in keywords:\n        helpers += \' life \'\n    \n    month = " january jan february feb march mar april apr may june jun july jul august aug september sep october oct november nov december dec "\n    year = " 1969 1970 1971 1980 1981 1982 1983 1984 1975 1976 1977 1978 1979 2001 2002 2003 2004 2005 2006 2007 2010 2012 2013 2014 2015"\n    # where locations\n    if ques_type[0].lower() == \'who\':\n        if \'found\' in question:\n        print("Who question")\n        \n    if ques_type[0].lower() == \'when\':\n        print("When question")\n        \n    if que

In [49]:
# Obtains contents from validation set & returns list of questions
def getValidationData():
    valFile = open("Validation-Data.txt", encoding='UTF-8')
    valData = valFile.read()
    valData = valData.strip()
    valList = valData.split("\n")
    
    totalQue = []
    totalAns = []
    
    for articleQueList in valList:
        queList = articleQueList.split("]]")
        questions = ast.literal_eval(queList[0] + "]]")
        
        for QApair in questions[1]:
            question = re.sub('\?', '', QApair[0])
            totalQue.append(question)
            answer = QApair[1]
            totalAns.append(answer)
 
    return totalQue, totalAns

In [70]:
# Obtains best possible answer for the query
def getAnswer(question):
    count = 0
    data_dict = {}

    # Pass the question through NLP pipeline
    count, queFromPipeline = NLP_Pipeline(question.lower(), count, data_dict, None)

    # Obtain features of the question which already passed through the NLP pipeline
    NEhints, WNfeatures, queType, lemmas, stems, dependencyList = questionFeatures(queFromPipeline)

    # Form a query string with best possible features for reliable answers
    queryStr = NEhints + " " +' '.join(WNfeatures) + " " + ' '.join(lemmas) +  " " +' '.join(stems)

    # Run the match query against indexed articles and obtain matched sentences
    sentences, scores, depParses, articles, NEs = GetMatchedSentences(queryStr, dependencyList)
    #print(articles)

    # Obtain only the relevant sentences
    relevantSentences = FindScore(queType, NEhints, sentences, scores, depParses, articles, NEs, dependencyList)
    #print(tuple(relevantSentences))

    # answer = relevantSentences[0][0]
    # article = relevantSentences[0][1]
    answer_candidates = []
    article_candidates = []

    for ans in relevantSentences:
        answer_candidates.append(ans[0])
        article_candidates.append(ans[1])

    # Result sentence
    # candidates = [ans[0] for ans in relevantSentences]
    # article_candidates = [ans[1] for ans in relevantSentences]
    answer = None if len(answer_candidates) == 0 else answer_candidates[0]
    article = None if len(article_candidates) == 0 else article_candidates[0]
    #print(answer)
    
    return answer, article


In [75]:
# Runs the pipeline on the validation set and obtains accuracy
def validateAndGetAccuracy():
    questions, answers = getValidationData()
    total = len(questions)
    correct = 0
    idx = 1

    wb = Workbook()
    sheet1 = wb.add_sheet('Sheet 1')

    sheet1.write(0, 0, 'index')
    sheet1.write(0, 1, 'question')
    sheet1.write(0, 2, 'answer_sentence')
    sheet1.write(0, 3, 'document_id')

    
    for que, expectedAns in zip(questions, answers):
        #print("\n", que)
            
        obtainedAns, obtainedArticle = getAnswer(que)
        #print(obtainedAns)
        #print(obtainedArticle)
        #print(expectedAns)
            
        if obtainedAns is None:
            continue
                
        elif expectedAns in obtainedAns:
            correct += 1
            
        sheet1.write(idx, 0, str(idx))
        sheet1.write(idx, 1, que)
        sheet1.write(idx, 2, obtainedAns)
        sheet1.write(idx, 3, obtainedArticle)
        # Tracks how many questions are completed & prints status for every 250 questions
        if idx % 250 == 0:
            print("Completed answering", idx, "questions in Validation Data")
        idx += 1

    wb.save('output.xls')
    errors = total - correct
    accuracy = (correct / total) * 100
    print("Correct: ", correct, "\t Total: ", total, "\t Incorrect: ", errors)
    print("Validation Accuracy: ", round(accuracy, 2), "%")

In [76]:
validateAndGetAccuracy()

Completed answering 250 questions in Validation Data
Completed answering 500 questions in Validation Data
Completed answering 750 questions in Validation Data
Completed answering 1000 questions in Validation Data
Completed answering 1250 questions in Validation Data
Completed answering 1500 questions in Validation Data
Completed answering 1750 questions in Validation Data
Completed answering 2000 questions in Validation Data
Completed answering 2250 questions in Validation Data
Completed answering 2500 questions in Validation Data
Correct:  1222 	 Total:  2505 	 Incorrect:  1283
Validation Accuracy:  48.78 %


In [None]:
getAnswer("What is the Leader of the Revolution also known as in Iran?")

In [None]:
""" ------------------------ TASK 3 ------------------------ """
# Reads content from the input file using fileName & returns questions
# It considers the relative path to be in the same location as this ipynb
def readInput(fileName):
    fName = fileName + ".txt"
    
    inputData = open(fName).read()
    inputData = inputData.strip()
    questions = inputData.splitlines()
   
    return questions


# Produces output in the required format & save as .csv

