In [2]:
# Turn off unnecessary warnings
import warnings
warnings.filterwarnings("ignore")

# Import all the required packages
import json
import nltk
import urllib
import en_core_web_sm
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.wsd import lesk
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.corpus import stopwords


# Performs Word tokenization on sentences
def Tokenization(sentence):
    """Split a sentence into word tokens.

    Args:
        sentence: Text of the sentence to tokenize.

    Returns:
        The result of ``nltk.word_tokenize`` applied to ``sentence``.
    """
    return nltk.word_tokenize(sentence)


# Performs Word Lemmatization
def Lemmatization(word_tokens, lemmatizer=None):
    """Lemmatize every token of a tokenized sentence.

    Args:
        word_tokens: Iterable of word tokens.
        lemmatizer: Optional object exposing ``lemmatize(token)``. When
            omitted, a new ``WordNetLemmatizer`` is created, so the function
            no longer depends on a global that only exists when the file is
            run as a script.

    Returns:
        A list with one lemma per input token, in input order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokens]


# Performs POS tagging
def POSTagging(word_tokens):
    """Attach a part-of-speech tag to each token.

    Args:
        word_tokens: Word tokens of one sentence.

    Returns:
        The result of ``nltk.pos_tag`` applied to ``word_tokens``.
    """
    return nltk.pos_tag(word_tokens)


# Obtains sentence heads
def getHeads(sentence, word_tokens):
    """Find the head word of a sentence.

    The sentence is dependency-parsed through the CoreNLP server at
    ``http://localhost:9000`` and the word of the node whose ``head`` field
    is 0 (presumably the dependency root) is taken as the head. When the
    parse yields no such word, the first token tagged as a verb (``VB*``) is
    used, and failing that the first token tagged as a noun (``NN*``).

    Args:
        sentence: Raw sentence text.
        word_tokens: Word tokens of the sentence; only used by the
            POS-tag fallback.

    Returns:
        A list holding exactly one single-element list, e.g. ``[["migrate"]]``.
        The inner element is ``""`` when the sentence is empty after
        stripping or when no head could be determined, so callers can always
        index ``[0]``.
    """
    # Surrounding spaces and quote characters are dropped before parsing.
    strippedSen = sentence.strip(" '\"")
    if not strippedSen:
        # For an empty sentence, "" is reported as the head.
        return [[""]]

    # Set up the dependency parser only when there is something to parse.
    dependencyParser = CoreNLPDependencyParser(url='http://localhost:9000')
    parseTree = next(iter(dependencyParser.raw_parse(strippedSen)), None)

    headWord = ""
    if parseTree is not None:
        # `next` with a default avoids an IndexError when no node has head 0.
        headWord = next(
            (node["word"] for node in parseTree.nodes.values()
             if node["head"] == 0 and node["word"]),
            "",
        )
    if headWord:
        return [[headWord]]

    # Fallback: prefer the first verb; only look for a noun if no verb exists.
    tagged = nltk.pos_tag(word_tokens)
    for tagPrefix in ("VB", "NN"):
        for token, tag in tagged:
            if tag.startswith(tagPrefix):
                return [[token]]

    # Nothing usable was found; keep the one-entry shape of the result.
    return [[""]]


# Obtains WordNet Features
def WordNetFeatures(sentence, word_tokens):
    """Collect WordNet relations for the tokens of one sentence.

    For each token the best sense is chosen with the Lesk algorithm, using
    the sentence's tokens as context. ``lesk`` expects an iterable of words;
    a raw string would be treated as a set of single characters.

    Args:
        sentence: Raw sentence text. Kept for interface compatibility; the
            context for sense selection is taken from ``word_tokens``.
        word_tokens: Word tokens of the sentence.

    Returns:
        A 5-tuple of lists
        ``(hypernyms, hyponyms, meronyms, holonyms, synonyms)``.
        ``synonyms`` has one entry per token: the first lemma name of the
        chosen sense, or the token itself when no sense is found. Each of
        the other lists gets an entry only for tokens whose chosen sense has
        at least one such relation (first related sense, first lemma name).
    """
    hypernyms_list = []
    hyponyms_list = []
    meronyms_list = []
    holonyms_list = []
    synonyms_list = []

    # Materialize once so the tokens can serve both as loop input and context.
    context = list(word_tokens)

    for token in context:
        # Extracts best sense for each word using LESK
        best_sense = lesk(context, token)

        if best_sense is None:
            # When there's no best sense, the token itself is the synonym.
            synonyms_list.append(token)
            continue

        synonyms_list.append(best_sense.lemmas()[0].name())

        # Each relation is queried once; only the first related sense is kept.
        relations = (
            (best_sense.hypernyms, hypernyms_list),
            (best_sense.hyponyms, hyponyms_list),
            (best_sense.part_meronyms, meronyms_list),
            (best_sense.part_holonyms, holonyms_list),
        )
        for relation, bucket in relations:
            related_senses = relation()
            if related_senses:
                bucket.append(related_senses[0].lemmas()[0].name())

    return hypernyms_list, hyponyms_list, meronyms_list, holonyms_list, synonyms_list
   
    
# Performs Dependency Parsing
def DependencyParsing(sentence):
    """Dependency-parse a sentence and return its triples.

    Args:
        sentence: Raw sentence text sent to the CoreNLP server at
            ``http://localhost:9000``.

    Returns:
        A list built from the ``triples()`` of the parse.

    Raises:
        ValueError: If the parser does not yield exactly one parse (the
            single-target unpacking below requires it).
    """
    parser = CoreNLPDependencyParser(url='http://localhost:9000')
    [graph] = parser.raw_parse(sentence)
    return list(graph.triples())
    
    
# Main method
if __name__ == "__main__":
    # Script entry point: downloads each article, extracts per-sentence NLP
    # features, and writes one JSON file per article.

    # `import urllib` alone does not guarantee the `urllib.request` submodule
    # is loaded, so import it explicitly before calling `urlopen`.
    import os
    import urllib.request

    # List of all article names in the repository
    articleNames = ["109.txt", "111.txt", "151.txt", "160.txt", "177.txt",
                    "179.txt", "181.txt", "196.txt", "199.txt", "220.txt",
                    "222.txt", "226.txt", "247.txt", "273.txt", "281.txt",
                    "282.txt", "285.txt", "287.txt", "288.txt", "297.txt",
                    "304.txt", "342.txt", "347.txt", "360.txt", "390.txt",
                    "400.txt", "428.txt", "56.txt", "58.txt", "6.txt"]

    folderPath = "https://raw.githubusercontent.com/SaiManasaVedantam/NLP-QA-System-Datasets/main/Articles/"
    outputFolder = '../Pipeline-Output/'

    # Loop-invariant resources are created once instead of once per file.
    # Bound at module level so helpers that look up `wordnet_lemmatizer` by
    # name can find it.
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

    # Make sure the destination exists before any file is written.
    os.makedirs(outputFolder, exist_ok=True)

    for articleName in articleNames:
        print("Started Processing File : " + articleName)

        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(folderPath + articleName) as response:
            content = response.read().decode("utf-8")

        # Split the article into sentences
        sentences = tokenizer.tokenize(content)
        print("Extracting features for each sentence in the file...")

        # Maps a 1-based sentence number to that sentence's features
        corpus_dict = {}
        for count, sen in enumerate(sentences, start=1):
            print("\n------SENTENCE------")
            print(sen)

            word_tokens = Tokenization(sen)
            word_lemmas = Lemmatization(word_tokens)
            word_POStags = POSTagging(word_tokens)
            hypernyms, hyponyms, meronyms, holonyms, synonyms = WordNetFeatures(sen, word_tokens)
            depParse = DependencyParsing(sen)
            headList = getHeads(sen, word_tokens)

            # Process data format to suit the Elastic Search requirements
            corpus_dict[count] = {
                "sentence": sen,
                "tokenized_text": word_tokens,
                "lemma": word_lemmas,
                "tagged": word_POStags,
                "dependency_parse": depParse,
                "synonyms": synonyms,
                "hypernyms": hypernyms,
                "hyponyms": hyponyms,
                "meronyms": meronyms,
                "holonyms": holonyms,
                "head_word": headList[0],
                "file_name": articleName,
            }

        output_name = outputFolder + 'Parsed-' + articleName
        with open(output_name, 'w', encoding='utf8') as output_file:
            json.dump(corpus_dict, output_file, indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False)

        print("Completed Processing File : " + articleName)

    print("Task 1 Successfully Completed !!!")

Started Processing File 109.txt
Total Sentences After splitting the document:  204
Extracting features for each of the sentences and shown below:

------SENTENCE------
Bird migration is the regular seasonal movement, often north and south along a flyway, between breeding and wintering grounds.

------SENTENCE------
Many species of bird migrate.

------SENTENCE------
Migration carries high costs in predation and mortality, including from hunting by humans, and is driven primarily by availability of food.

------SENTENCE------
It occurs mainly in the northern hemisphere, where birds are funnelled on to specific routes by natural barriers such as the Mediterranean Sea or the Caribbean Sea.

------SENTENCE------
Historically, migration has been recorded as much as 3,000 years ago by Ancient Greek authors including Homer and Aristotle, and in the Book of Job, for species such as storks, turtle doves, and swallows.

------SENTENCE------
More recently, Johannes Leche began recording dates of 


------SENTENCE------
Most migrations begin with the birds starting off in a broad front.

------SENTENCE------
Often, this front narrows into one or more preferred routes termed flyways.

------SENTENCE------
These routes typically follow mountain ranges or coastlines, sometimes rivers, and may take advantage of updrafts and other wind patterns or avoid geographical barriers such as large stretches of open water.

------SENTENCE------
The specific routes may be genetically programmed or learned to varying degrees.

------SENTENCE------
The routes taken on forward and return migration are often different.

------SENTENCE------
A common pattern in North America is clockwise migration, where birds flying North tend to be further West, and flying South tend to shift Eastwards.

------SENTENCE------
Many, if not most, birds migrate in flocks.

------SENTENCE------
For larger birds, flying in flocks reduces the energy cost.

------SENTENCE------
Geese in a V-formation may conserve 12–20% of


------SENTENCE------
Massive numbers of large raptors and storks pass through areas such as the Strait of Messina, Gibraltar, Falsterbo, and the Bosphorus at migration times.

------SENTENCE------
More common species, such as the European honey buzzard Pernis apivorus, can be counted in hundreds of thousands in autumn.

------SENTENCE------
Other barriers, such as mountain ranges, can also cause funnelling, particularly of large diurnal migrants.

------SENTENCE------
This is a notable factor in the Central American migratory bottleneck.

------SENTENCE------
Batumi bottleneck in the Caucasus is one of the heaviest migratory funnels on earth.

------SENTENCE------
Avoiding flying over the Black Sea surface and across high mountains, hundreds of thousands of soaring birds funnel through an area around the city of Batumi, Georgia.

------SENTENCE------
Birds of prey such as honey buzzards which migrate using thermals lose only 10 to 20% of their weight during migration, which may explai


------SENTENCE------
At this stage the bird is in the position of a boy scout with a compass but no map, until it grows accustomed to the journey and can put its other capabilities to use.

------SENTENCE------
With experience it learns various landmarks and this "mapping" is done by magnetites in the trigeminal system, which tell the bird how strong the field is.

------SENTENCE------
Because birds migrate between northern and southern regions, the magnetic field strengths at different latitudes let it interpret the radical pair mechanism more accurately and let it know when it has reached its destination.

------SENTENCE------
There is a neural connection between the eye and "Cluster N", the part of the forebrain that is active during migrational orientation, suggesting that birds may actually be able to see the magnetic field of the earth.

------SENTENCE------
Migrating birds can lose their way and appear outside their normal ranges.

------SENTENCE------
This can be due to flying


------SENTENCE------
Prior to the designation of immunity from the etymological root immunis, which is Latin for "exempt"; early physicians characterized organs that would later be proven as essential components of the immune system.

------SENTENCE------
The important lymphoid organs of the immune system are the thymus and bone marrow, and chief lymphatic tissues such as spleen, tonsils, lymph vessels, lymph nodes, adenoids, and liver.

------SENTENCE------
When health conditions worsen to emergency status, portions of immune system organs including the thymus, spleen, bone marrow, lymph nodes and other lymphatic tissues can be surgically excised for examination while patients are still alive.

------SENTENCE------
Many components of the immune system are typically cellular in nature and not associated with any specific organ; but rather are embedded or circulating in various tissues located throughout the body.

------SENTENCE------
Classical immunology ties in with the fields of ep


------SENTENCE------
The theory was later modified to reflect new discoveries regarding histocompatibility or the complex "two-signal" activation of T cells.

------SENTENCE------
The self/nonself theory of immunity and the self/nonself vocabulary have been criticized, but remain very influential.

------SENTENCE------
Bioscience is the overall major in which undergraduate students who are interested in general well-being take in college.

------SENTENCE------
Immunology is a branch of bioscience for undergraduate programs but the major gets specified as students move on for graduate program in immunology.

------SENTENCE------
The aim of immunology is to study the health of humans and animals through effective yet consistent research, (AAAAI, 2013).

------SENTENCE------
The most important thing about being immunologists is the research because it is the biggest portion of their jobs.

------SENTENCE------
Most graduate immunology schools follow the AAI courses immunology which are o


------SENTENCE------
Created in the 1920s, the UCR system has not proven to be as uniform as its name implies.

------SENTENCE------
The UCR data only reflect the most serious offense in the case of connected crimes and has a very restrictive definition of rape.

------SENTENCE------
Since about 93% of the data submitted to the FBI is in this format, the UCR stands out as the publication of choice as most states require law enforcement agencies to submit this data.

------SENTENCE------
FBI records show that 85% of COINTELPRO resources targeted groups and individuals that the FBI deemed "subversive", including communist and socialist organizations; organizations and individuals associated with the Civil Rights Movement, including Martin Luther King, Jr. and others associated with the Southern Christian Leadership Conference, the National Association for the Advancement of Colored People, and the Congress of Racial Equality and other civil rights organizations; black nationalist groups


------SENTENCE------
The Post reported from Zegart's book that government documents show the CIA and FBI missed 23 potential chances to disrupt the terrorist attacks of September 11, 2001.

------SENTENCE------
The primary reasons for the failures included: agency cultures resistant to change and new ideas; inappropriate incentives for promotion; and a lack of cooperation between the FBI, CIA and the rest of the United States Intelligence Community.

------SENTENCE------
The book blamed the FBI's decentralized structure, which prevented effective communication and cooperation among different FBI offices.

------SENTENCE------
The book suggested that the FBI has not evolved into an effective counter-terrorism or counter-intelligence agency, due in large part to deeply ingrained agency cultural resistance to change.

------SENTENCE------
For example, FBI personnel practices continue to treat all staff other than special agents as support staff, classifying intelligence analysts alongsid


------SENTENCE------
The database cross-referenced logs of Wikipedia edits with publicly available records pertaining to the internet IP addresses edits were made from.

------SENTENCE------
Griffith was motivated by the edits from the United States Congress, and wanted to see if others were similarly promoting themselves.

------SENTENCE------
The tool was designed to detect conflict of interest edits.

------SENTENCE------
Among his findings were that FBI computers were used to edit the FBI article in Wikipedia.

------SENTENCE------
Although the edits correlated with known FBI IP addresses, there was no proof that the changes actually came from a member or employee of the FBI, only that someone who had access to their network had edited the FBI article in Wikipedia.

------SENTENCE------
Wikipedia spokespersons received Griffith's "WikiScanner" positively, noting that it helped prevent conflicts of interest from influencing articles as well as increasing transparency and mitigating


------SENTENCE------
The FBI is also charged with the responsibility of enforcing compliance of the United States Civil Rights Act of 1964 and investigating violations of the act in addition to prosecuting such violations with the United States Department of Justice (DOJ).

------SENTENCE------
The FBI also shares concurrent jurisdiction with the Drug Enforcement Administration (DEA) in the enforcement of the Controlled Substances Act of 1970.

------SENTENCE------
The FBI often works in conjunction with other Federal agencies, including the U.S. Coast Guard (USCG) and U.S. Customs and Border Protection (CBP) in seaport and airport security, and the National Transportation Safety Board in investigating airplane crashes and other critical incidents.


KeyboardInterrupt: 