# Wikipedia-API

In [None]:
%%bash
# Upgrade pip, then install Haystack directly from its GitHub repository
# (the repository's current state, not a tagged PyPI release).
pip install --upgrade pip
pip install git+https://github.com/deepset-ai/haystack.git

In [None]:
import logging

# Root logger: WARNING and above, rendered as "LEVEL - logger name -  message".
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

# The "haystack" logger is set to the more verbose INFO level.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [None]:
from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es, print_questions

In [None]:
!pip install -U transformers==3.0.0
!python -m nltk.downloader punkt
!git clone https://github.com/patil-suraj/question_generation.git

In [None]:
!pip install git+https://github.com/boudinfl/pke.git
!python -m spacy download en_core_web_sm

In [None]:
!pip install flashtext

In [None]:
!git clone https://github.com/amontgomerie/question_generator/

In [None]:
!pip install Wikipedia-API

In [None]:
import wikipediaapi

In [None]:
# Wikipedia client; 'en' is passed as the first positional argument.
# NOTE(review): Wikipedia-API is installed unpinned earlier in this notebook, and newer
# releases appear to expect a user_agent argument here — confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia('en')

# Page object for the 'Machine_Learning' title; the following cells read its
# existence flag, title, summary, URL and full text.
ml_art = wiki_wiki.page('Machine_Learning')

In [None]:
# Report whether the requested article exists.
page_exists = ml_art.exists()
print("Page - Exists: %s" % (page_exists,))

In [None]:
# Show the title reported for the article.
page_title = ml_art.title
print("Page - Title: %s" % (page_title,))

In [None]:
# Preview only the first 60 characters of the article summary.
summary_preview = ml_art.summary[:60]
print("Page - Summary: %s" % (summary_preview,))

In [None]:
print(ml_art.fullurl)

In [None]:
ml_ftxt = ml_art.text
ml_ftxt

In [None]:
ml_summary = ml_art.summary
ml_summary

# Clean ML Text

In [None]:
import re
import string

In [None]:
# Cleaning step 1: lower-case the whole article text.
ml_ftxt = ml_ftxt.lower()

In [None]:
# Replace every run of line breaks with a single space. Deleting them outright
# fuses the last word of a paragraph or heading with the first word of the next
# one (e.g. "analytics.overviewlearning"), which hides sentence boundaries from
# the sentence tokenizer and the keyphrase extractors used later.
ml_ftxt = re.sub(r'\n+', ' ', ml_ftxt)

In [None]:
ml_ftxt

In [None]:
# Delete every hyphen character; the two halves of a hyphenated compound are
# joined into a single token (e.g. "self-teaching" becomes "selfteaching").
ml_ftxt = re.sub(r'\-', '', ml_ftxt)

In [None]:
ml_ftxt

In [None]:
# Strip U+200A (HAIR SPACE) characters from the text.
hair_space_pattern = re.compile(r'\u200a')
ml_ftxt = hair_space_pattern.sub('', ml_ftxt)

In [None]:

ml_ftxt

In [None]:
# Remove punctuation, but keep commas and full stops.
my_punctuation = string.punctuation
for kept_char in (",", "."):
    my_punctuation = my_punctuation.replace(kept_char, "")

# Deletion-only translation table: each remaining punctuation character maps to None.
translator = str.maketrans(dict.fromkeys(my_punctuation))
ml_ftxt = ml_ftxt.translate(translator)

In [None]:

ml_ftxt

In [None]:
# Drop every ASCII digit; surrounding whitespace is left as it is.
digit_pattern = re.compile(r'[0-9]')
ml_ftxt = digit_pattern.sub('', ml_ftxt)

In [None]:
ml_ftxt

In [None]:
# Remove " - " separators.
# NOTE(review): when the cells are run top to bottom, every '-' has already been
# deleted by the hyphen regex and by the punctuation translation above, so this
# pattern has nothing left to match.
ml_ftxt = re.sub(r' - ','', ml_ftxt)

In [None]:
ml_ftxt

# T5 With text Data only

In [None]:
# Max length 265.
txt1 = "machine learning ml is a field of inquiry devoted to understanding and building methods that learn, that is, methods that leverage data to improve performance on some set of tasks. it is seen as a part of artificial intelligence."
txt2 = "the study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning. some implementations of machine learning use data and neural networks in a way that mimics the working of a biological brain.  in its application across business problems, machine learning is also referred to as predictive analytics.overviewlearning algorithms work on the basis that strategies, algorithms, and inferences that worked well in the past are likely to continue working well in the future. these inferences can be obvious, such as since the sun rose every morning for the last , days, it will probably rise tomorrow morning as well. they can be nuanced, such as x of families have geographically separate species with color variants, so there is a y chance that undiscovered black swans exist."
txt3 = "machine learning programs can perform tasks without being explicitly programmed to do so. it involves computers learning from data provided so that they carry out certain tasks. for simple tasks assigned to computers, it is possible to program algorithms telling the machine how to execute all steps required to solve the problem at hand on the computers part, no learning is needed. for more advanced tasks, it can be challenging for a human to manually create the needed algorithms. in practice, it can turn out to be more effective to help the machine develop its own algorithm, rather than having human programmers specify every needed step.the discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available. in cases where vast numbers of potential answers exist, one approach is to label some of the correct answers as valid. this can then be used as training data for the computer to improve the algorithms it uses to determine correct answers. for example, to train a system for the task of digital character recognition, the mnist dataset of handwritten digits has often been used.history and relationships to other fieldsthe term machine learning was coined in  by arthur samuel, an ibm employee and pioneer in the field of computer gaming and artificial intelligence. also the synonym selfteaching computers were used in this time period.by the early s an experimental learning machine with punched tape memory, called cybertron, had been developed by raytheon company to analyze sonar signals, electrocardiograms and speech patterns using rudimentary reinforcement learning. it was repetitively trained by a human operatorteacher to recognize patterns and equipped with a goof button to cause it to reevaluate incorrect decisions."
txt4 = "a representative book on research into machine learning during the s was nilssons book on learning machines, dealing mostly with machine learning for pattern classification. interest related to pattern recognition continued into the s, as described by duda and hart in . in  a report was given on using teaching strategies so that a neural network learns to recognize  characters  letters,  digits, and  special symbols from a computer terminal.tom m. mitchell provided a widely quoted, more formal definition of the algorithms studied in the machine learning field a computer program is said to learn from experience e with respect to some class of tasks t and performance measure p if its performance at tasks in t, as measured by p,  improves with experience e. this definition of the tasks in which machine learning is concerned offers a fundamentally operational definition rather than defining the field in cognitive terms. this follows alan turings proposal in his paper computing machinery and intelligence, in which the question can machines think is replaced with the question can machines do what we as thinking entities can do."
txt5 = "modern day machine learning has two objectives, one is to classify data based on models which have been developed, the other purpose is to make predictions for future outcomes based on these models."

In [None]:
%cd question_generation

In [None]:
from pipelines import pipeline

In [None]:
model = pipeline("question-generation", model="valhalla/t5-base-qg-hl")

In [None]:
len(txt1)

In [None]:
model(txt1)

In [None]:
model(txt2)

In [None]:
model(txt3)

In [None]:
model(txt4)

In [None]:
model(txt5)

# Get Keyphrases

# Models

In [None]:
import pke

In [None]:
All_top = []

In [None]:
txt = ml_ftxt

## Unsupervised

### Graph Based Models

#### TopicRank

**NOTES on TopicRank**:
* unsupervised graph-based ranking model for keyphrase extraction
* uses a random walk algorithm -> to estimate the importance of each topic (node)

In [None]:
# initialize a TopicRank keyphrase extraction model
extractor = pke.unsupervised.TopicRank()

In [None]:
extractor.load_document(input=txt, language='en') # used to pre-process the text (sentence splitting, tokenization, Part-of-Speech tagging, stemming).

In [None]:
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

In [None]:
extractor.candidate_selection()  #identifying keyphrase candidates

 In **TopicRank**, candidate weighting is a three-step process:
1. candidate clustering (grouping keyphrase candidates into topics)
2. graph construction (building a complete-weighted-graph of topics)
3. rank topics (nodes) using a random walk algorithm


In [None]:
extractor.candidate_weighting()

In [None]:
# Get the N-best candidates (here, 20) as keyphrases
keyphrases = extractor.get_n_best(n=20, stemming=False)

In [None]:
All_top.extend(keyphrases)

In [None]:
All_top

#### MultipartiteRank Model

In [None]:
# 1. create a MultipartiteRank extractor.
extractor = pke.unsupervised.MultipartiteRank()

# 2. load the content of the document.
extractor.load_document(input=txt, language='en')

# 3. add candidates matching the noun-phrase grammar
#    (zero or more adjectives followed by one or more nouns / proper nouns).
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

# 4. run the model's own candidate selection, as every other extractor cell in
#    this notebook does. (This was previously a second, argument-less
#    grammar_selection() call, so candidate_selection() never ran.)
extractor.candidate_selection()

# 5. weight the candidates.
extractor.candidate_weighting()

# 6. keep the 20 highest-scored candidates as keyphrases.
keyphrases = extractor.get_n_best(n=20)


In [None]:
All_top.extend(keyphrases)

In [None]:
All_top

#### TopicalPageRank Model

In [None]:
# 1. create a TopicalPageRank extractor.
extractor = pke.unsupervised.TopicalPageRank()

# 2. load the content of the document.
extractor.load_document(input=txt,
                        language='en')

#    add candidates matching the noun-phrase grammar
#    (zero or more adjectives followed by one or more nouns / proper nouns).
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

# 3. select the noun phrases as keyphrase candidates.
extractor.candidate_selection()

# 4. weight the keyphrase candidates using Single Topical PageRank.
#    Builds a word-graph in which edges connecting two words occurring
#    in a window are weighted by co-occurrence counts.
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)

In [None]:
All_top.extend(keyphrases)

In [None]:
All_top

### Statistical models

#### FirstPhrases

In [None]:
 # 1. create a FirstPhrases baseline extractor.
extractor = pke.unsupervised.FirstPhrases()

# 2. load the content of the document.
extractor.load_document(input=txt,language='en')

#    add candidates matching the noun-phrase grammar
#    (zero or more adjectives followed by one or more nouns / proper nouns).
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

# 3. select the longest sequences of nouns and adjectives as candidates.
extractor.candidate_selection()

# 4. weight the candidates using their position
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)

In [None]:
All_top.extend(keyphrases)

In [None]:
All_top

#### TF-IDF

In [None]:
extractor = pke.unsupervised.TfIdf()        # initialize a keyphrase extraction model, here TFxIDF

# NOTE(review): unlike the other extractor cells, no language= is passed here —
# confirm that the library default is English.
extractor.load_document(input=txt)       # load the content of the document (str or spacy Doc)

# add candidates matching the noun-phrase grammar (adjectives followed by nouns / proper nouns)
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

extractor.candidate_selection()             # identify keyphrase candidates

extractor.candidate_weighting()             # weight keyphrase candidates

keyphrases = extractor.get_n_best(n=20)      # select the 20-best candidates as keyphrases

In [None]:
All_top.extend(keyphrases)

In [None]:
All_top

#### KPMiner Model

In [None]:
# 1. create a KPMiner extractor.
extractor = pke.unsupervised.KPMiner()

# 2. load the content of the document.
extractor.load_document(input=txt,language='en')

# 3. select {1-5}-grams that do not contain punctuation marks or
#    stopwords as keyphrase candidates. Set the least allowable seen
#    frequency to 5 and the number of words after which candidates are
#    filtered out to 200.
lasf = 5
cutoff = 200
extractor.candidate_selection(lasf=lasf, cutoff=cutoff)

# 4. weight the candidates using KPMiner weighting function.
#    candidate_weighting() is called with no arguments below; the commented-out
#    lines sketch how a document-frequency table and alpha/sigma could be passed.
#df = pke.load_document_frequency_file(input_file="path/to/df.tsv.gz")

#alpha = 2.3
#sigma = 3.0
# df=df, alpha=alpha, sigma=sigma

extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)


In [None]:
All_top.extend(keyphrases)

In [None]:
All_top

## Supervised



#### Kea

In [None]:
# 1. create a Kea extractor.
extractor = pke.supervised.Kea()

# 2. load the content of the document.
# NOTE(review): `stoplist` is assigned but never passed to any call in this cell —
# confirm whether load_document() was meant to receive it.
stoplist = pke.lang.stopwords.get('en')
extractor.load_document(input=txt, language='en')

# 3. select 1-3 grams that do not start or end with a stopword as
#    candidates. Candidates that contain punctuation marks as words
#    are discarded.
extractor.candidate_selection()

# 4. classify candidates as keyphrase or not keyphrase.
#    candidate_weighting() is called with no arguments below; the commented-out
#    lines sketch how a trained model file and a document-frequency table could be passed.
#df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
#model_file = 'path/to/kea_model'
#model_file=model_file,df=df
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)

In [None]:
All_top.extend(keyphrases)

In [None]:
All_top

# Compare

In [None]:
import numpy as np

In [None]:
# Pack everything collected in All_top into one array and take its sorted unique values.
# NOTE(review): All_top entries are indexed with [0] to get the phrase further down, so
# each entry presumably also carries a score; NumPy would then stringify the scores and
# `u` would mix phrases with score strings — confirm.
x = np.array(All_top)
u = np.unique(x)

In [None]:
u

In [None]:
# Keep only the entries of `u` from position 130 onwards and display them.
# NOTE(review): 130 is a hard-coded offset, presumably chosen to skip the non-keyphrase
# entries at the front of the sorted array; it must be re-checked whenever the set of
# extractors, the value of n, or the input text changes.
z = u[130:]
z

In [None]:
# Collect the keyphrases that appear at least twice across all extractor results.
# The phrases are counted straight from All_top (entry[0] is the phrase), so the
# result no longer depends on the hard-coded slice offset used to build `z`, and
# the nested index loop is replaced by a single counting pass.
from collections import Counter

phrase_counts = Counter(entry[0] for entry in All_top)

# Sorted so the order matches the alphabetical order the unique-array approach produced.
l = sorted(phrase for phrase, count in phrase_counts.items() if count >= 2)

In [None]:
l

In [None]:
import nltk
nltk.download('punkt')
  
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def tokenize_sentences(text):
    """Split *text* into sentences and return the stripped ones longer than 20 characters.

    The length check is applied to each sentence exactly as ``sent_tokenize``
    produced it, i.e. before surrounding whitespace is stripped.
    """
    return [
        candidate.strip()
        for candidate in sent_tokenize(text)
        if len(candidate) > 20
    ]

def get_sentences_for_keyword(keywords, sentences):
    """Map every keyword to the sentences that mention it, longest sentence first.

    Args:
        keywords: iterable of keyword strings to look for.
        sentences: iterable of sentence strings to search.

    Returns:
        dict keyed by keyword, in the order the keywords were given. Each value
        is the list of matching sentences sorted by decreasing length; a keyword
        with no match maps to an empty list. A sentence is listed at most once
        per keyword, even when the keyword occurs in it several times.
    """
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    for sentence in sentences:
        # extract_keywords() reports one hit per occurrence; de-duplicate so a
        # sentence that repeats a keyword is not stored several times.
        for key in set(keyword_processor.extract_keywords(sentence)):
            keyword_sentences[key].append(sentence)

    # Longest sentences first; sorted() keeps the input order for equal lengths.
    return {
        key: sorted(found, key=len, reverse=True)
        for key, found in keyword_sentences.items()
    }

# Split the article text held in `txt` into sentences, then group those sentences
# under each keyphrase collected in `l`.
sentences = tokenize_sentences(txt)
keyword_sentence_mapping = get_sentences_for_keyword(l, sentences)
        

In [None]:
keyword_sentence_mapping

# T5 Model with Keywords sentences

In [None]:
# Question-generation pipeline used for the keyword sentences. It is built with the
# same task name and checkpoint as the `model` pipeline created earlier in this notebook.
# NOTE(review): looks like `model` could be reused instead of constructing a second pipeline — confirm.
with_words = pipeline("question-generation", model="valhalla/t5-base-qg-hl")

In [None]:
type(keyword_sentence_mapping)

In [None]:
li = list(keyword_sentence_mapping.values())

In [None]:
ke = list(keyword_sentence_mapping.keys())

In [None]:
len(li)

In [None]:
len(ke)

In [None]:
ke[0]

In [None]:
li[0][8]

In [None]:
# Record (and print) the positions of the first keyword's sentences that are
# shorter than 256 characters.
xx = []
for i, candidate_sentence in enumerate(li[0]):
    if len(candidate_sentence) >= 256:
        continue
    print(i)
    xx.append(i)

In [None]:
li[0][8]

In [None]:
with_words(li[0][8])

In [None]:
# Generate questions for every sentence whose index was collected in `xx`.
# The result is printed explicitly: a bare expression inside a loop body is not
# displayed by the notebook, so the generated questions were previously lost.
# Iterating over `xx` directly also avoids rebinding the earlier `z` array.
for sentence_idx in xx:
    print(sentence_idx)
    print(with_words(li[0][sentence_idx]))

# T5 Question Generation using the Hugging Face input format

`<answer>Answer<context>Context`


In [None]:
li = list(keyword_sentence_mapping.values())
ke = list(keyword_sentence_mapping.keys())

In [None]:
# Build the "<answer>…<context>…" input string: the answer is the whole first
# keyword and the context is the ninth sentence found for it. (The keyword was
# previously indexed with [8] as well, which picked a single character of the
# keyword, or raised IndexError for keywords shorter than nine characters.)
ar = "<answer>" + ke[0] + "<context>" + li[0][8]

In [None]:
li[0][1]

In [None]:
%cd question_generator/
%load questiongenerator.py
from questiongenerator import QuestionGenerator
from questiongenerator import print_qa

In [None]:
import torch

# Select the CUDA device when PyTorch reports one as available, otherwise the CPU,
# and stop with a hint about the runtime settings when no GPU is in use.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
assert device.type == 'cuda', "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"

In [None]:
qg = QuestionGenerator()

In [None]:
# Request 10 questions with answer_style 'all' for the prepared input string,
# then print the resulting question/answer list.
generation_options = {"num_questions": 10, "answer_style": "all"}
qa_list = qg.generate(ar, **generation_options)
print_qa(qa_list)