In [5]:
# Turn off unnecessary warnings
import warnings
warnings.filterwarnings("ignore")

# Import all the required packages
import json
import nltk
import urllib
import en_core_web_sm
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.wsd import lesk
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.corpus import stopwords


# Performs Word tokenization on sentences
def Tokenization(sentence):
    """Split ``sentence`` into word tokens using NLTK's word tokenizer.

    Args:
        sentence: The sentence text to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` yields for ``sentence``.
    """
    return nltk.word_tokenize(sentence)


# Performs Word Lemmatization
def Lemmatization(word_tokens, lemmatizer=None):
    """Lemmatize every token in ``word_tokens``.

    Args:
        word_tokens: Iterable of word strings to lemmatize.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When it is
            omitted a ``WordNetLemmatizer`` is created here, so the function
            does not depend on a module-level ``wordnet_lemmatizer`` that is
            only assigned when the file runs as a script.

    Returns:
        A list holding one lemma per input token, in input order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokens]


# Performs POS tagging
def POSTagging(word_tokens):
    """Attach a part-of-speech tag to each token.

    Args:
        word_tokens: The tokens to tag.

    Returns:
        The result of ``nltk.pos_tag`` applied to ``word_tokens``.
    """
    return nltk.pos_tag(word_tokens)


# Obtains sentence heads
def _first_token_tagged(tagged_tokens, tag_prefix):
    """Return the first word whose POS tag starts with ``tag_prefix``, else ""."""
    for word, tag in tagged_tokens:
        if tag.startswith(tag_prefix):
            return word
    return ""


def getHeads(sentence, word_tokens):
    """Find the head word of ``sentence``.

    The sentence is dependency-parsed by the CoreNLP server at
    ``http://localhost:9000`` and the word attached to the root (the node
    whose ``head`` is 0) is taken as the head. If the parse yields no such
    word, the first verb-tagged token is used, and failing that the first
    noun-tagged token.

    Args:
        sentence: Raw sentence text; surrounding spaces and quote characters
            are stripped before parsing.
        word_tokens: Word tokens of the sentence, used only for the POS-based
            fallback.

    Returns:
        A list containing exactly one single-element list, ``[[head]]``.
        ``head`` is ``""`` for an empty sentence or when no head is found, so
        callers can always index ``[0]``.
    """
    stripedSen = sentence.strip(" '\"")

    # For an empty sentence, "" is reported as the head; no parser is needed
    if stripedSen == "":
        return [[""]]

    # Set up the dependency parser only when there is something to parse
    dependencyParser = CoreNLPDependencyParser(url='http://localhost:9000')
    parseTree = next(iter(dependencyParser.raw_parse(stripedSen)), None)

    headWord = ""
    if parseTree is not None:
        # The root word is the node whose head is the artificial node 0
        rootWords = [
            node["word"]
            for node in parseTree.nodes.values()
            if node["head"] == 0
        ]
        if rootWords and rootWords[0]:
            headWord = rootWords[0]

    if headWord == "":
        # Fallback: prefer the first verb, otherwise the first noun
        tagged = nltk.pos_tag(word_tokens)
        headWord = _first_token_tagged(tagged, "VB")
        if headWord == "":
            headWord = _first_token_tagged(tagged, "NN")

    return [[headWord]]


# Obtains WordNet Features
def WordNetFeatures(sentence, word_tokens):
    """Collect WordNet relations for the disambiguated sense of each token.

    Each token is disambiguated with ``lesk`` using the tokenized sentence as
    context. For a token with a sense, the first lemma name of that sense is
    recorded as its synonym, and the first lemma name of the first hypernym,
    hyponym, part-meronym and part-holonym is recorded when that relation is
    non-empty. A token without a sense is recorded as its own synonym.

    Args:
        sentence: The raw sentence text. Kept for interface compatibility;
            the context handed to ``lesk`` is ``word_tokens``.
        word_tokens: Word tokens of the sentence.

    Returns:
        A tuple ``(hypernyms, hyponyms, meronyms, holonyms, synonyms)`` of
        lists of lemma names. ``synonyms`` has one entry per token; the other
        lists only have entries for tokens where the relation exists.
    """
    # Lists holding the word-sense features gathered across all tokens
    hypernyms_list = []
    hyponyms_list = []
    meronyms_list = []
    holonyms_list = []
    synonyms_list = []

    for token in word_tokens:
        # lesk() builds its context with set(context_sentence), so it must be
        # given the word tokens; a raw string would become a set of characters
        best_sense = lesk(word_tokens, token)

        # When there's no best sense, the token itself is the synonym
        if best_sense is None:
            synonyms_list.append(token)
            continue

        synonyms_list.append(best_sense.lemmas()[0].name())

        # Keep the first lemma of the first related synset, when one exists
        relations = (
            (best_sense.hypernyms(), hypernyms_list),
            (best_sense.hyponyms(), hyponyms_list),
            (best_sense.part_meronyms(), meronyms_list),
            (best_sense.part_holonyms(), holonyms_list),
        )
        for related_synsets, target_list in relations:
            if related_synsets:
                target_list.append(related_synsets[0].lemmas()[0].name())

    return hypernyms_list, hyponyms_list, meronyms_list, holonyms_list, synonyms_list
   
    
# Performs Dependency Parsing
def DependencyParsing(sentence):
    """Dependency-parse ``sentence`` and return its triples.

    The sentence is sent to the CoreNLP server at ``http://localhost:9000``.
    Exactly one parse is expected back; the unpacking below raises
    ``ValueError`` otherwise.

    Args:
        sentence: Raw sentence text to parse.

    Returns:
        A list of the triples produced by the parse's ``triples()`` method.
    """
    corenlp = CoreNLPDependencyParser(url='http://localhost:9000')

    # Single-element unpacking: the parser must yield exactly one graph
    [graph] = corenlp.raw_parse(sentence)

    return list(graph.triples())
    
    
# Main method
if __name__ == "__main__":
    # `import urllib` on its own does not guarantee that the `urllib.request`
    # submodule is loaded, so it is imported explicitly before use.
    import os
    import urllib.request

    # List of all article names in the repository
    articleNames = ["109.txt", "111.txt", "151.txt", "160.txt", "177.txt",
                    "179.txt", "181.txt", "196.txt", "199.txt", "220.txt",
                    "222.txt", "226.txt", "288.txt", "297.txt", "304.txt",
                    "342.txt", "347.txt", "360.txt", "390.txt", "400.txt",
                    "56.txt", "58.txt", "6.txt"]
    fileCount = len(articleNames)

    content = ""
    folderPath = "https://raw.githubusercontent.com/SaiManasaVedantam/NLP-QA-System-Datasets/main/Articles/"

    # Loop-invariant helpers: built once instead of once per article.
    # `wordnet_lemmatizer` stays a module-level name because the
    # lemmatization step may look it up as a global.
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

    for i, articleName in enumerate(articleNames):
        # Download the article; the context manager closes the HTTP response
        fileName = folderPath + articleName
        with urllib.request.urlopen(fileName) as response:
            content = response.read().decode("utf-8")

        # Per-article state: sentence counter and the records keyed by it
        count = 0
        corpus_dict = {}

        # Split the article into sentences
        sentences = list(tokenizer.tokenize(content))

        # Sentence count
        print("Total Sentences After splitting the document: ", len(sentences))
        print("Extracting features for each of the sentences and shown below:")

        # Extract the features of every sentence
        for sen in sentences:
            print("\n------SENTENCE------")
            print(sen)

            print("\nWord Tokenization : Done")
            word_tokens = Tokenization(sen)

            print("Word Lemmatization : Done")
            word_lemmas = Lemmatization(word_tokens)

            print("POS Tagging : Done")
            word_POStags = POSTagging(word_tokens)

            print("WordNet Feature Extraction : Done")
            hypernyms, hyponyms, meronyms, holonyms, synonyms = WordNetFeatures(sen, word_tokens)

            print("Dependency Parsing : Done")
            depParse = DependencyParsing(sen)

            print("Obtaining Heads : Done")
            headList = getHeads(sen, word_tokens)

            # Process data format to suit the Elastic Search requirements:
            # one record per sentence, keyed by its 1-based position
            count = count + 1
            corpus_dict[count] = {
                "sentence": sen,
                "tokenized_text": word_tokens,
                "lemma": word_lemmas,
                "tagged": word_POStags,
                "dependency_parse": depParse,
                "synonyms": synonyms,
                "hypernyms": hypernyms,
                "hyponyms": hyponyms,
                "meronyms": meronyms,
                "holonyms": holonyms,
                "head_word": headList[0],
                "file_name": articleName,
            }

        output_name = '../Pipeline-Output/Parsed-' + articleName

        # Make sure the target folder exists so the parsed data is not lost
        # to a missing-directory error after all the processing is done
        os.makedirs(os.path.dirname(output_name), exist_ok=True)

        with open(output_name, 'w', encoding='utf8') as output_file:
            json.dump(corpus_dict, output_file, indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False)

Total Sentences After splitting the document:  204
Extracting features for each of the sentences and shown below:

------SENTENCE------
Bird migration is the regular seasonal movement, often north and south along a flyway, between breeding and wintering grounds.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Many species of bird migrate.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Migration carries high costs in predation and mortality, including from hunting by humans, and is driven primarily by availability of food.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
It occurs mainly in the northern hemisp

Obtaining Heads : Done

------SENTENCE------
Many bird populations migrate long distances along a flyway.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
The most common pattern involves flying north in the spring to breed in the temperate or Arctic summer and returning in the autumn to wintering grounds in warmer regions to the south.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Of course, in the southern hemisphere the directions are reversed, but there is less land area in the far south to support long-distance migration.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
The primary motivation for migrati


------SENTENCE------
Red knots Calidris canutus and dunlins Calidris alpina were found in radar studies to fly 5 km/h (3.1 mph) faster in flocks than when they were flying alone.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Birds fly at varying altitudes during migration.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
An expedition to Mt.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Everest found skeletons of northern pintail Anas acuta and black-tailed godwit Limosa limosa at 5,000 m (16,000 ft) on the Khumbu Glacier.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feat

Obtaining Heads : Done

------SENTENCE------
For some species of waders, migration success depends on the availability of certain key food resources at stopover points along the migration route.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
This gives the migrants an opportunity to refuel for the next leg of the voyage.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Some examples of important stopover locations are the Bay of Fundy and Delaware Bay.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Some bar-tailed godwits Limosa lapponica have the longest known non-stop flight of any migrant, flying 11,000 k

Obtaining Heads : Done

------SENTENCE------
They land in the morning and may feed for a few days before resuming their migration.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
The birds are referred to as passage migrants in the regions where they occur for short durations between the origin and destination.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Nocturnal migrants minimize predation, avoid overheating, and can feed during the day.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
One cost of nocturnal migration is the loss of sleep.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : 

Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
This is termed protandry.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Navigation is based on a variety of senses.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Many birds have been shown to use a sun compass.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Using the sun for direction involves the need for making compensation based on the time.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Navigation has al

Obtaining Heads : Done

------SENTENCE------
The timing of this molt - usually once a year but sometimes twice - varies with some species molting prior to moving to their winter grounds and others molting prior to returning to their breeding grounds.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Apart from physiological adaptations, migration sometimes requires behavioural changes such as flying in flocks to reduce the energy used in migration or the risk of predation.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Migration in birds is highly labile and is believed to have developed independently in many avian lineages.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Depende


------SENTENCE------
Birds were last seen in their favourite wintering grounds in Keoladeo National Park in 2002.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Structures such as power lines, wind farms and offshore oil-rigs have also been known to affect migratory birds.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
Other migration hazards include pollution, storms, wildfires, and habitat destruction along migration routes, denying migrants food at stopover points.

Word Tokenization : Done
Word Lemmatization : Done
POS Tagging : Done
WordNet Feature Extraction : Done
Dependency Parsing : Done
Obtaining Heads : Done

------SENTENCE------
For example, in the East Asian–Australasian Flyway, up to 65% of key intertidal habitat at t