In [None]:
# Turn off unnecessary warnings
import warnings
warnings.filterwarnings("ignore")

# Import all the required packages
import json
import nltk
import string
import urllib
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.wsd import lesk
from nltk.parse import CoreNLPParser
from nltk.corpus import stopwords
from nltk.parse.corenlp import CoreNLPDependencyParser

# Start common things globally
stop_words = stopwords.words('english') + list(string.punctuation)
dependencyParser = CoreNLPDependencyParser(url='http://localhost:9000')
namedEntityTagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')

# Performs Word tokenization on sentences
def Tokenization(sentence):
    tokens = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]
    return tokens


# Performs Word Lemmatization
def Lemmatization(word_tokens):
    lemmas = []
    for token in word_tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas


# Performs POS tagging
def POSTagging(word_tokens):
    POStags = nltk.pos_tag(word_tokens)
    return POStags   


# Obtains sentence heads
def getHeads(sentence, word_tokens):
    # Create a head list to add the heads
    headList = []
    
    # Split the sentence
    stripedSen = sentence.strip(" '\"")
    if stripedSen != "":
        # Perform dependency parse
        depParse = dependencyParser.raw_parse(stripedSen)
        parseTree = list(depParse)[0]
        headWord = ""
        headWord = [k["word"] for k in parseTree.nodes.values() if k["head"] == 0][0]
        
        # Appends head if it's not empty
        if headWord != "":
            headList.append([headWord])
            
        # Obtain head word based on two cases
        else:
            for i, pp in enumerate(tagged):
                if pp.startswith("VB"):
                    headList.append([word_tokens[i]])
                    break
            if headWord == "":
                for i, pp in enumerate(tagged):
                    if pp.startswith("NN"):
                        headList.append([word_tokens[i]])
                        break
                        
    # For empty sentence, we just append "" as head
    else:
        headList.append([""])
 
    return headList


# Obtains WordNet Features
def WordNetFeatures(sentence, word_tokens):
    # Creates dictionaries for important word senses
    hypernyms_list = []
    hyponyms_list = []
    meronyms_list = []
    holonyms_list = []
    synonyms_list = []
    
    # Populates the above dictionaries according to the word senses associated with them
    for token in word_tokens:
        # Extracts best sense for each word using LESK
        best_sense = lesk(sentence, token)
        
        if best_sense is not None:
            # Obtains Synonyms
            synonym = token
            if best_sense. lemmas()[0].name() != token:
                synonym = best_sense.lemmas()[0].name()
            synonyms_list.append(synonym)
            
            # Obtains Hypernyms
            if best_sense.hypernyms() != []:
                hypernyms_list.append(best_sense.hypernyms()[0].lemmas()[0].name())
        
            # Obtains Hyponyms
            if best_sense.hyponyms() != []:
                hyponyms_list.append(best_sense.hyponyms()[0].lemmas()[0].name())
            
            # Obtains Meronyms
            if best_sense.part_meronyms() != []:
                meronyms_list.append(best_sense.part_meronyms()[0].lemmas()[0].name())
                
            # Obtains Holonyms
            if best_sense.part_holonyms() != []:
                holonyms_list.append(best_sense.part_holonyms()[0].lemmas()[0].name())
          
        # When there's no best sense, the token itself is the Synonym
        else:
            synonyms_list.append(token)
            
    return hypernyms_list, hyponyms_list, meronyms_list, holonyms_list, synonyms_list
   
    
# Performs Dependency Parsing
def DependencyParsing(sentence):
    # Perform dependency parsing
    parse, = dependencyParser.raw_parse(sentence)
    
    # Dependency parsing to parse tree based patterns as features
    depParseResult = list(parse.triples())
    
    return depParseResult
    
    
# Obtains Named Entities
def NamedEntities(sentence, tokens):
    # Word tokenize again and use them if NEs are present
    namedTokens = nltk.word_tokenize(sentence)
    NEtags = None
    
    try:
        NEtags = namedEntityTagger.tag(namedTokens)
    except:
        NEtags = namedEntityTagger.tag(tokens)
        
    return NEtags

# NLP pipeline through which all the articles & question will pass
def NLP_Pipeline(sentence, count, corpus_dict, articleName = None, queType = None):
    #print("\n------SENTENCE------")
    #print(sen)

    word_tokens = Tokenization(sentence)
    #print("\nWord Tokenization : Done")
    #print(word_tokens)

    word_NEtags = NamedEntities(sentence, word_tokens)
    #print("\nNamed Entity Tagging : Done")
    #print(word_NEtags)
    
    word_lemmas = Lemmatization(word_tokens)
    #print("Word Lemmatization : Done")
    #print(word_lemmas)

    word_POStags = POSTagging(word_tokens)
    #print("POS Tagging : Done")
    #print(word_POStags)

    hypernyms, hyponyms, meronyms, holonyms, synonyms = WordNetFeatures(sentence, word_tokens)
    #print("WordNet Feature Extraction : Done")
    #print(holonyms)
            
    depParse = DependencyParsing(sentence)
    #print("Dependency Parsing : Done")
    #print(depParse)

    headList = getHeads(sentence, word_tokens)
    #print("Obtaining Heads : Done")
    #print(headList)

    # Process data format to suit the Elastic Search requirements
    count = count + 1
    corpus_dict[count] = {}
            
    corpus_dict[count]["sentence"] = {}
    corpus_dict[count]["sentence"] = sentence
            
    corpus_dict[count]["tokenized_text"] = {}
    corpus_dict[count]["tokenized_text"] = word_tokens
            
    corpus_dict[count]["lemma"] = {}
    corpus_dict[count]["lemma"] = word_lemmas
    
    corpus_dict[count]["ner_tag"] = {}
    corpus_dict[count]["ner_tag"] = str(dict(word_NEtags))
            
    corpus_dict[count]["tagged"] = {}
    corpus_dict[count]["tagged"] = word_POStags
            
    corpus_dict[count]["dependency_parse"] = {}
    corpus_dict[count]["dependency_parse"] = depParse
            
    corpus_dict[count]["synonyms"] = {}
    corpus_dict[count]["synonyms"] = synonyms
            
    corpus_dict[count]["hypernyms"] = {}
    corpus_dict[count]["hypernyms"] = hypernyms
            
    corpus_dict[count]["hyponyms"] = {}
    corpus_dict[count]["hyponyms"] = hyponyms
            
    corpus_dict[count]["meronyms"] = {}
    corpus_dict[count]["meronyms"] = meronyms
            
    corpus_dict[count]["holonyms"] = {}
    corpus_dict[count]["holonyms"] = holonyms
            
    corpus_dict[count]["head_word"] = {}
    corpus_dict[count]["head_word"] = headList[0]
    
    # For question, we don't have the article name and then it will have a questionType
    if articleName is not None:
        corpus_dict[count]["file_name"] = {}
        corpus_dict[count]["file_name"] = articleName
        
        
    # For question, we should add the question type
    else:
        corpus_dict[count]["type_of_question"] = {}
        corpus_dict[count]["type_of_question"] = queType
    
    return count, corpus_dict
    
    
# Main method
if __name__ == "__main__":
    # List of all article names in the repository
    articleNames = ["109.txt", "111.txt", "151.txt", "160.txt", "177.txt", 
                    "179.txt","181.txt", "196.txt", "199.txt", "220.txt", 
                    "222.txt", "226.txt", "247.txt", "273.txt", "281.txt", 
                    "282.txt", "285.txt", "287.txt", "288.txt", "297.txt", 
                    "304.txt", "342.txt", "347.txt", "360.txt", "390.txt", 
                    "400.txt", "428.txt", "56.txt", "58.txt", "6.txt"] 
    fileCount = len(articleNames)
    
    content = ""
    folderPath = "https://raw.githubusercontent.com/SaiManasaVedantam/NLP-QA-System-Datasets/main/Articles/"
    for i in range(fileCount):
        print("\nStarted Processing File : " + articleNames[i])
        fileName = folderPath + articleNames[i]
        response = urllib.request.urlopen(fileName)
        webContents = response.read()
        stringTypeData = webContents.decode("utf-8")
        content = stringTypeData
        count = 0
        corpus_dict = {}

        # Obtain wordnet lemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()

        # Get tokenized content
        sentences = []
        tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
        sentences.extend(tokenizer.tokenize(content))

        # Sentence count
        #print("Total Sentences After splitting the document: ", len(sentences))
        print("Extracting features for each sentence in the file...")
    
        # Extracting words
        for sen in sentences:
            count, corpus_dict = NLP_Pipeline(sen, count, corpus_dict, articleNames[i])
                
        output_name = '../Pipeline-Output/Parsed-' + articleNames[i]
        with open(output_name, 'w+', encoding='utf8') as output_file:
            json.dump(corpus_dict, output_file,  indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False)
        
        print("Completed Processing File : " + articleNames[i])
        
    print("Task 1 Successfully Completed !!!")
    

In [44]:
# Turn off unnecessary warnings
import warnings
warnings.filterwarnings("ignore")

# Import all the required packages
import ssl
import json
import urllib
import requests
from elasticsearch import Elasticsearch
from elasticsearch import RequestsHttpConnection

# Main method
if __name__ == "__main__":
    # List of all article names in the repository
    articleNames = ["109.txt", "111.txt", "151.txt", "160.txt", "177.txt", 
                    "179.txt","181.txt", "196.txt", "199.txt", "220.txt", 
                    "222.txt", "226.txt", "247.txt", "273.txt", "281.txt", 
                    "282.txt", "285.txt", "287.txt", "288.txt", "297.txt", 
                    "304.txt", "342.txt", "347.txt", "360.txt", "390.txt", 
                    "400.txt", "428.txt", "56.txt", "58.txt", "6.txt"] 
    fileCount = len(articleNames)
    
    # Setup Elastic Search
    elastic = Elasticsearch([{'host': 'localhost', 'port': 9200, 'use_ssl' : False, 'ssl_verify' : False}], timeout=30, max_retries=10)
    
    # Obtain requests from the page
    req = requests.get("http://localhost:9200", verify=False)
    
    # Use indexing
    idx = 1
    
    content = ""
    folderPath = "https://raw.githubusercontent.com/SaiManasaVedantam/NLP-QA-System-Datasets/main/Pipeline-Output/Parsed-"
    for i in range(fileCount):
        print("Started Processing File : " + articleNames[i])
        fileName = folderPath + articleNames[i]
        response = urllib.request.urlopen(fileName)
        webContents = response.read()
        stringTypeData = webContents.decode("utf-8")
        content = stringTypeData
        
        # Obtain Json data from file contents
        jsonFile = json.loads(content)
        
        # Creating new index "articles" for each line in the article
        for key, value in jsonFile.items():
            elastic.index(index = "articles", doc_type = "text", id = idx, body = value)
            # print("Here")
            idx += 1
            
    print("Elastic Search Successfully Completed !!!")
    
    

Started Processing File : 109.txt
Started Processing File : 111.txt
Started Processing File : 151.txt
Started Processing File : 160.txt
Started Processing File : 177.txt
Started Processing File : 179.txt
Started Processing File : 181.txt
Started Processing File : 196.txt
Started Processing File : 199.txt
Started Processing File : 220.txt
Started Processing File : 222.txt
Started Processing File : 226.txt
Started Processing File : 247.txt
Started Processing File : 273.txt
Started Processing File : 281.txt
Started Processing File : 282.txt
Started Processing File : 285.txt
Started Processing File : 287.txt
Started Processing File : 288.txt
Started Processing File : 297.txt
Started Processing File : 304.txt
Started Processing File : 342.txt
Started Processing File : 347.txt
Started Processing File : 360.txt
Started Processing File : 390.txt
Started Processing File : 400.txt
Started Processing File : 428.txt
Started Processing File : 56.txt
Started Processing File : 58.txt
Started Processi

In [49]:
# Obtains features in the question by using the result obtained from the NLP Pipeline
def questionFeatures(question):
    # Get all the wordnet features
    WNfeatures = question[1]['synonyms'] + 
                 question[1]['meronyms'] + 
                 question[1]['hyponyms'] + 
                 question[1]['holonyms'] + 
                 question[1]['hypernyms']
       
    # Create hints for easy search using Named Entities and the Sentence head
    head = question[1]['head_word'][0]
    NEs = question[1]['ner_tag']
    NEhints = ""
    namedEntities = []
    
    for word, entity in NEs.items():
        namedEntities.append(entity)
        if entity == 'ORGANIZATION' or entity == 'LOCATION' or entity == 'PERSON':
            NEhints += " " + word + " "
            
    NEhints += " " + head + " "
    
    # Obtain question type and other features
    queType = question[1]['type_of_question']
    lemmas = question[1]['lemma']
    depParse = question[1]['dependency_parse']

    depList = list(list(x) for x in depParse)
    depElements = []
    
    for i in depList:
        if i[1] == 'nsubj' || i[1] == 'dobj':
            depElements.append(i[0])
     
    # Retrieve main elements from the dependency parse result
    dependencyList = list(list(x) for x in depElements)

    return NEhints, WNfeatures, queType, lemmas, dependencyList


# Check and obtain matched sentences using the query string
def GetMatchedSentences(queryStr, dependencyList):
    querybody = {
        "query": {
            "dis_max": {
                "queries": [
                    # { "match": { "lemma": {"query": spclQuery,"boost": 2}  }},
                    {"multi_match": {'query': queryStr, "fields": [
                        # "lemma^2.0", "synonyms^0.5", "meronyms^0.1", "holonyms^0.1", "hypernyms^0.1", "hyponyms^0.1"]}},
                        "lemma^2", "ner_tag", "synonyms", "meronyms^0.5", "holonyms^0.5", "hypernyms^0.5", "hyponyms^0.5"]}},
                    ]
                }
            }
        }

    result = elastic.search(index = "articles", body=querybody)
    answers = result['hits']['hits']
    depParses, sentences, scores, articles, NEs = [], [], [], [], []
    
    for i in range(len(answers)):
        sentence = result['hits']['hits'][i]['_source']['sentence']
        sentences.append(sentence)
        
        score = result['hits']['hits'][i]['_score']
        scores.append(score)
        
        depParse = result['hits']['hits'][i]['_source']['dependency_parse']
        depParses.append(depParse)
        
        article = result['hits']['hits'][i]['_source']['file_name']
        articles.append(article)
        
        NE = result['hits']['hits'][i]['_source']['ner_tag']
        NEs.append(NE)
        
    return sentences, scores, depParses, articles, NEs


question = "what is ukiyo-e an example of?"
tokens = nltk.word_tokenize(question)

count, queFromPipeline = NLP_Pipeline(question, count = 0, corpus_dict, None)
NEhints, WNfeatures, queType, lemmas, dependencyList = questionFeatures(queFromPipeline)
queryStr = NEhints + " " +' '.join(WNfeatures) + " " + ' '.join(lemmas)
sentences, scores, depParses, articles, NEs = GetMatchedSentences(queryStr, dependencyList)
#relevantSentences = computeScore(queType, NEhints, sentences, scores, depParses, articles, NEs, dependencyList)

for sent in sentences:
    print(sent, "\n")









"""

queType = ""
hintWord = ""
if " what " in input:
    queType="what"
    hintWord="ORGANIZATION"
    
if " who " or " whom " in input:
    queType="who"
    hintWord="PERSON"
    
if " when " in input:
    queType="when"
    hintWord="DATE"


lemmas = Lemmatization(input)
word_tokens = Tokenization(input)
hypernyms, hyponyms, meronyms, holonyms, synonyms = WordNetFeatures(sen, word_tokens)
# theQuery =  keywords + " " +  heuristics +  " " +' '.join(similarWords) + " " + ' '.join(lemma)+  " " +' '.join(stems)
query =  ' '.join(lemmas) + " " + ' '.join(hyponyms)

querybody = {
        "query": {
            "dis_max": {
                "queries": [
                    # { "match": { "lemma": {"query": spclQuery,"boost": 2}  }},
                    {"multi_match": {'query': query, "fields": [
                        # "lemma^2.0", "synonyms^0.5", "meronyms^0.1", "holonyms^0.1", "hypernyms^0.1", "hyponyms^0.1"]}},
                        "lemma^2", "synonyms", "meronyms^0.5", "holonyms^0.5", "hypernyms^0.5", "hyponyms^0.5"]}},
                ]
            }
        }
    }

print(query)
ans2 = elastic.search(index="articles", body=querybody)
answers = ans2['hits']['hits']
"""

{1: {'sentence': 'what is ukiyo-e an example of?', 'tokenized_text': ['ukiyo-e', 'example'], 'lemma': ['ukiyo-e', 'example'], 'ner_tag': "{'what': 'O', 'is': 'O', 'ukiyo': 'O', '-': 'O', 'e': 'O', 'an': 'O', 'example': 'O', 'of': 'O', '?': 'O'}", 'tagged': [('ukiyo-e', 'JJ'), ('example', 'NN')], 'dependency_parse': [(('what', 'WP'), 'cop', ('is', 'VBZ')), (('what', 'WP'), 'nsubj', ('example', 'NN')), (('example', 'NN'), 'dep', ('e', 'NN')), (('e', 'NN'), 'amod', ('ukiyo', 'JJ')), (('e', 'NN'), 'punct', ('-', 'HYPH')), (('example', 'NN'), 'det', ('an', 'DT')), (('example', 'NN'), 'acl', ('of', 'IN')), (('what', 'WP'), 'punct', ('?', '.'))], 'synonyms': ['ukiyo-e', 'model'], 'hypernyms': ['representation'], 'hyponyms': ['lodestar'], 'meronyms': [], 'holonyms': [], 'head_word': ['what']}, 2: {'sentence': 'A botanist or plant scientist is a scientist who specializes in this field.', 'tokenized_text': ['botanist', 'plant', 'scientist', 'scientist', 'specializes', 'field'], 'lemma': ['botani

'\n\nqueType = ""\nhintWord = ""\nif " what " in input:\n    queType="what"\n    hintWord="ORGANIZATION"\n    \nif " who " or " whom " in input:\n    queType="who"\n    hintWord="PERSON"\n    \nif " when " in input:\n    queType="when"\n    hintWord="DATE"\n\n\nlemmas = Lemmatization(input)\nword_tokens = Tokenization(input)\nhypernyms, hyponyms, meronyms, holonyms, synonyms = WordNetFeatures(sen, word_tokens)\n# theQuery =  keywords + " " +  heuristics +  " " +\' \'.join(similarWords) + " " + \' \'.join(lemma)+  " " +\' \'.join(stems)\nquery =  \' \'.join(lemmas) + " " + \' \'.join(hyponyms)\n\nquerybody = {\n        "query": {\n            "dis_max": {\n                "queries": [\n                    # { "match": { "lemma": {"query": spclQuery,"boost": 2}  }},\n                    {"multi_match": {\'query\': query, "fields": [\n                        # "lemma^2.0", "synonyms^0.5", "meronyms^0.1", "holonyms^0.1", "hypernyms^0.1", "hyponyms^0.1"]}},\n                        "lemma^2

In [50]:
sentences = []
scores = []
articles = []
for i in range(len(answers)):
    sent = ans2['hits']['hits'][i]['_source']['sentence']
    score = ans2['hits']['hits'][i]['_score']
    article = ans2['hits']['hits'][i]['_source']['file_name']
    sentences.append(sent)
    scores.append(score)
    articles.append(article)
    
for sent in sentences:
    print(sent, "\n")
    
print(scores)
print(articles)

E, P, D). 

In the mid-1950s Karl H. Beyer, James M. Sprague, John E. Baer, and Frederick C. Novello of Merck and Co. discovered and developed chlorothiazide, which remains the most widely used antihypertensive drug today. 

To the west, E. University Boulevard leads to the Fourth Avenue Shopping District. 

Japanese comics and cartooning (manga),[g] have a history that has been seen as far back as the anthropomorphic characters in the 12th-to-13th-century Chōjū-jinbutsu-giga, 17th-century toba-e and kibyōshi picture books, and woodblock prints such as ukiyo-e which were popular between the 17th and 20th centuries. 

An experimental study by German cognitive psychologists L. Schwabe and O. Wolf demonstrates how learning under stress also decreases memory recall in humans. 

Cartoonists began creating comics for mature audiences, and the term "Ninth Art"[e] was coined, as comics began to attract public and academic attention as an artform. 

Helping his father in Visible Speech demonstr