In [1]:
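"""
Two summarizers: one built on t5-base (`summarize`) and one on facebook/bart-large-cnn (`p_summarize`).
INPUT args: p - list of paragraphs (each a string); paragraphs shorter than 100 characters are skipped;
            shuffle - bool, whether to shuffle the order of the paragraphs, defaults to False;
            min_length - int, minimum length passed to the model's generate call, defaults to 0.
OUTPUTs: summary - string, the summaries of the individual paragraphs joined by single spaces;
         (abandoned) sumlist - list, list of summaries of each part of the original text.
"""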
from transformers import AutoModelWithLMHead, AutoTokenizer
import random

# Load the t5-base checkpoint and its tokenizer once for the whole notebook;
# `summarize` reads both as globals and `get_error_rate` reuses `tokenizer`.
# NOTE(review): AutoModelWithLMHead is reported as deprecated by recent
# transformers releases — confirm against the installed version.
model = AutoModelWithLMHead.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

def summarize(p, shuffle=False,min_length=0):
    """Summarize each paragraph of ``p`` with the t5-base model and join the results.

    Args:
        p: list of paragraph strings.
        shuffle: when truthy, the paragraphs are processed in random order.
        min_length: forwarded to ``model.generate`` as ``min_length``.

    Returns:
        str: the per-paragraph summaries separated by single spaces, or ""
        when no paragraph is long enough to be summarized.
    """
    order = list(range(len(p)))
    if shuffle:
        random.shuffle(order)

    pieces = []
    for i in order:
        # Paragraphs shorter than 100 characters are skipped.
        if len(p[i]) < 100:
            continue
        ip = tokenizer.encode("summarize: " + p[i], return_tensors="pt", max_length=512, truncation=True)
        op = model.generate(ip, max_length=150, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        # skip_special_tokens keeps markers such as <pad> and </s> out of the
        # returned text regardless of the installed tokenizer version.
        pieces.append(tokenizer.decode(op[0], skip_special_tokens=True))
    return " ".join(pieces)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Load the facebook/bart-large-cnn checkpoint and its tokenizer once;
# `p_summarize` reads both as globals.
p_model = AutoModelWithLMHead.from_pretrained("facebook/bart-large-cnn")
p_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def p_summarize(p, shuffle=False,min_length=0):
    """Summarize each paragraph of ``p`` with the bart-large-cnn model and join the results.

    Args:
        p: list of paragraph strings.
        shuffle: when truthy, the paragraphs are processed in random order.
        min_length: forwarded to ``p_model.generate`` as ``min_length``.

    Returns:
        str: the per-paragraph summaries separated by single spaces, or ""
        when no paragraph is long enough to be summarized.
    """
    order = list(range(len(p)))
    if shuffle:
        random.shuffle(order)

    pieces = []
    for i in order:
        # Paragraphs shorter than 100 characters are skipped.
        if len(p[i]) < 100:
            continue
        # NOTE(review): the "summarize: " prefix mirrors the T5 helper; it is
        # kept so existing outputs do not change — confirm BART needs it.
        ip = p_tokenizer.encode("summarize: " + p[i], return_tensors="pt", max_length=1024, truncation=True)
        op = p_model.generate(ip, max_length=150, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        # Let the tokenizer drop its own special tokens instead of slicing a
        # fixed 7 characters off the front, which also left any trailing
        # end-of-sequence marker in the text.
        pieces.append(p_tokenizer.decode(op[0], skip_special_tokens=True))
    return " ".join(pieces)

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
"""
Functions to fetch text
INPUT: name of a Wikipedia page
get OUTPUT: list, paragraphs of the introduction part
get_full OUTPUT: dictionary, each key is the heading of one section of the main body
                 of the input Wikipedia page; each value is the list of paragraphs of the
                 corresponding section (or a dictionary keyed by sub-heading when the
                 section contains "=== " sub-sections)
"""
import wikipedia
from bs4 import BeautifulSoup

def get(n):
    """Return the introduction of the Wikipedia page named ``n`` as a list of lines.

    The text is obtained from ``wikipedia.summary`` and split on newlines.
    """
    return wikipedia.summary(n).split('\n')

def wash(sec,topic):
    """Clean the lines of a page and replace formula lines with placeholders.

    Args:
        sec: list of str, the page content split into lines.
        topic: page object; only ``topic.html()`` is used, to collect the
            ``annotation`` elements of the rendered page.

    Returns:
        list of str: a new list in which
          * lines whose characters before the last one are all spaces (this
            covers empty, blank and single-character lines) are dropped;
          * the remaining lines are stripped of surrounding whitespace;
          * a line equal to the text of the k-th annotation, matched in
            document order, is replaced by ``"formula k"``.

    The list passed in is not modified.
    """
    cleaned = [line.strip() for line in sec if line[:-1].strip(" ")]
    equations = BeautifulSoup(topic.html()).find_all('annotation')
    limit = len(equations)
    i = 1  # 1-based number of the next annotation to look for
    for index, line in enumerate(cleaned):
        # Stop only once every annotation has been placed; stopping at
        # i == limit would leave the last annotation unreplaced.
        if i > limit:
            break
        if line == equations[i-1].text:
            cleaned[index] = "formula " + str(i)
            i += 1
    return cleaned


def dictcreator(sec,stage=1):
    """Group the lines of ``sec`` under the headings of the requested level.

    Args:
        sec: list of lines (a whole page for stage 1, one section for stage 2).
        stage: 1 splits on lines starting with ``"== "``, 2 on lines starting
            with ``"=== "``. Any other value finds no heading.

    Returns:
        dict mapping each heading line to the list of lines that follow it, up
        to the next heading of the same level. With ``stage=2`` and no
        ``"=== "`` heading, ``sec`` itself is returned unchanged. Lines that
        come before the first heading are not part of the dict.
    """
    prefix = {1: "== ", 2: "=== "}.get(stage)
    part_list = []
    if prefix is not None:
        # Non-string entries can never be headings; skip them explicitly
        # rather than swallowing every exception.
        part_list = [i for i, part in enumerate(sec)
                     if isinstance(part, str) and part.startswith(prefix)]
    if stage == 2 and not part_list:
        return sec

    text = {}
    # Pair each heading position with the next one (None for the last), so
    # every section is sliced from its own heading, wherever that heading is.
    for start, end in zip(part_list, part_list[1:] + [None]):
        text[sec[start]] = sec[start+1:end]
    return text


def get_full(n):
    """Fetch the Wikipedia page ``n`` and return its main body grouped by section.

    The page content is split into lines and cleaned with ``wash``. Everything
    before the first line starting with "==" and everything from the
    "== See also ==" line onwards is discarded. The rest is grouped with
    ``dictcreator``, and each section is grouped again with ``stage=2``.
    """
    page = wikipedia.page(n)
    lines = wash(page.content.split('\n'), page)

    # remove the introduction: keep everything from the first heading on
    first_heading = next((pos for pos, line in enumerate(lines) if line[:2] == "=="), None)
    if first_heading is not None:
        lines = lines[first_heading:]

    # remove the endings: cut at the "See also" heading when there is one
    if "== See also ==" in lines:
        lines = lines[:lines.index("== See also ==")]

    sections = dictcreator(lines)
    for heading in sections:
        sections[heading] = dictcreator(sections[heading], stage=2)
    return sections

In [70]:
# Smoke test: fetch the sectioned text of the "Italy" page and dump it.
abc=get_full("Italy")
print(abc)

{'== Name ==': ['Hypotheses for the etymology of the name "Italia" are numerous. One is that it was borrowed via Greek from the Oscan Víteliú \'land of calves\' (cf. Lat vitulus "calf", Umb vitlo "calf"). Greek historian Dionysius of Halicarnassus states this account together with the legend that Italy was named after Italus, mentioned also by Aristotle and Thucydides.According to Antiochus of Syracuse, the term Italy was used by the Greeks to initially refer only to the southern portion of the Bruttium peninsula corresponding to the modern province of Reggio and part of the provinces of Catanzaro and Vibo Valentia in southern Italy. Nevertheless, by his time the larger concept of Oenotria and "Italy" had become synonymous and the name also applied to most of Lucania as well. According to Strabo\'s Geographica, before the expansion of the Roman Republic, the name was used by Greeks to indicate the land between the strait of Messina and the line connecting the gulf of Salerno and gulf o

In [40]:
from bs4 import BeautifulSoup
import wikipedia

# Scratch cell: print the text of every <annotation> element found in the
# rendered HTML of the "second order logic" page, one per line.
topic = wikipedia.page('second order logic')
equations = BeautifulSoup(topic.html()).find_all('annotation')

for eq in equations:
    # The commented-out tail is an unfinished attempt at stripping the
    # "{\displaystyle ...}" wrapper from each annotation.
    print(eq.text)#.split('{\\displaystyle ')[1][:-1])

{\displaystyle \forall P\,\forall x(Px\lor \neg Px)}
{\displaystyle \Sigma _{0}^{1}}
{\displaystyle \Pi _{0}^{1}}
{\displaystyle \Sigma _{1}^{1}}
{\displaystyle \exists R_{0}\ldots \exists R_{m}\phi }
{\displaystyle \phi }
{\displaystyle \Sigma _{1}^{1}}
{\displaystyle \Pi _{1}^{1}}
{\displaystyle \Sigma _{k+1}^{1}}
{\displaystyle \exists R_{0}\ldots \exists R_{m}\phi }
{\displaystyle \phi }
{\displaystyle \Pi _{k}^{1}}
{\displaystyle \Pi _{k+1}^{1}}
{\displaystyle \forall R_{0}\ldots \forall R_{m}\phi }
{\displaystyle \phi }
{\displaystyle \Sigma _{k}^{1}}
{\displaystyle \langle +,\cdot ,\leq \rangle }


In [55]:
# Scratch cell: inline prototype of the cleaning done by `wash`, run on the
# "second order logic" page.
sec=wikipedia.page("second order logic").content
sec = sec.split('\n')
# Iterate over a copy so that removing from `sec` does not skip elements.
for line in sec[:]:
    # Drop lines made only of spaces (including empty lines) ...
    if line==" "*len(line):
        sec.remove(line)
    # ... and lines whose characters before the last one are all spaces.
    elif line[:-1]==" "*(len(line)-1):
        sec.remove(line)
    else:
        continue
sec = [line.strip() for line in sec]
# NOTE(review): the page is fetched a second time here; the object from the
# first call could be reused.
topic = wikipedia.page('second order logic')
equations = BeautifulSoup(topic.html()).find_all('annotation')
limit = len(equations)
i=1  # 1-based number of the next annotation to look for
for index, line in enumerate(sec):
    # NOTE(review): equations[i-1] raises IndexError when the page has no
    # annotation at all (limit == 0).
    if line==equations[i-1].text:
        sec[index]="formula "+str(i)
        if i==limit:
            break
        i+=1
print(sec)
# The string below is the author's to-do note; being the last expression of the
# cell it is also echoed as the cell output. English translation:
#   - When the preceding item ends with ':' and has no closing full stop, it
#     must be joined together with the item embedded after it.
#   - A formula must be joined together with the items before and after it.
#   - Approach: search through a copy of the object, operate on the original.
"""
前项是‘：’而且没有句号结尾的要把嵌镶后项一起加起来
formula要把前后项一起加起来
思路，搜索对象的拷贝，操作原对象
"""

['In logic and mathematics second-order logic is an extension of first-order logic, which itself is an extension of propositional logic. Second-order logic is in turn extended by higher-order logic and type theory.', 'First-order logic quantifies only variables that range over individuals (elements of the domain of discourse); second-order logic, in addition, also quantifies over relations. For example, the second-order sentence', 'formula 1', 'says that for every formula P, and every individual x, either Px is true or not(Px) is true (this is the principle of bivalence). Second-order logic also includes quantification over sets, functions, and other variables as explained in the section Syntax and fragments. Both first-order and second-order logic use the idea of a domain of discourse (often called simply the "domain" or the "universe"). The domain is a set over which individual elements may be quantified.', '== Examples ==', 'In first-order logic, one can quantify over individuals, b

'\n前项是‘：’而且没有句号结尾的要把嵌镶后项一起加起来\nformula要把前后项一起加起来\n思路，搜索对象的拷贝，操作原对象\n'

In [50]:
# Scratch experiment: merge the first three items of a list into the first slot
# and then remove items while the list is shrinking.
string=" "  # not used in this cell
sequence=["h","a","i","e","o"]
sequence[0]=sequence[0]+sequence[1]+sequence[2]
# list.remove deletes the first element equal to its argument and shifts the
# rest left: this call removes "a", leaving ["hai", "i", "e", "o"].
sequence.remove(sequence[1])
# After the shift, index 2 holds "e" (not "i"), so "e" is what gets removed.
sequence.remove(sequence[2])
print(sequence)

['hai', 'i', 'o']


In [4]:
'''
Evaluation tool: error rate
'''
import language_tool_python
# Shared LanguageTool checker for US English, read as a global by get_error_rate.
tool = language_tool_python.LanguageTool('en-US')

def get_error_rate(text):
    """Return the number of detected language errors per token of ``text``.

    The error count is the number of matches reported by ``tool.check``, plus
    a penalty of 3 when the text does not end with ".", "!" or "?" (an
    incomplete ending weighs more). That count is divided by the number of
    tokens ``tokenizer.encode`` returns for the text (truncated to 512).

    An empty ``text`` is treated as incomplete instead of raising IndexError.
    """
    error = len(tool.check(text))
    # uncompleteness has more weight; str.endswith is safe on "" where
    # text[-1] is not
    if not text.endswith((".", "!", "?")):
        error += 3
    token_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    return error/len(token_ids[0])

self._url: http://127.0.0.1:8081/v2/


In [5]:
'''
bart-t5
'''
# Pages to summarize. The list had been blanked to [""], which cannot be
# looked up; the four names are restored from the recorded output of this cell.
names=["Italy", "Quasar", "Black hole", "Central processing unit"]
summary_s=[]  # one BART summary of the introduction per page, in `names` order
for name in names:
    p = get(name)
    summary = p_summarize(p,min_length=40)
    summary_s.append(summary)
    print("  ")
    print("Finish summarization of " + name)
    print(" ")

  
Finish summarization of Italy
 
  
Finish summarization of Quasar
 
  
Finish summarization of Black hole
 
  
Finish summarization of Central processing unit
 


In [10]:
# Second pass: condense each summary in summary_s with the T5 summarizer and
# average the error rates of the results.
error_rate = 0
nor = len(summary_s)  # number of summaries the average is taken over
for summary in summary_s:
    summary = [summary]  # summarize() takes a list of paragraphs
    s = summarize(summary,min_length=80)
    print(s)
    error_rate+=get_error_rate(s)
print("Ave ER:")
# NOTE(review): raises ZeroDivisionError when summary_s is empty.
print(error_rate/nor)

Italy is a unitary parliamentary republic with Rome as its capital. it shares land borders with France, Switzerland, Austria, Slovenia, and microstates of Vatican City and San Marino. with around 60 million inhabitants, it is the third-most populous member state of the eu. it has been home to numerous ancient peoples and cultures, the most predominant are the Indo-European italic peoples.
a pulsar is a highly magnetized rotating neutron star that emits beams of electromagnetic radiation out of its magnetic poles. pulsars are one of the candidates for the source of ultra-high-energy cosmic rays. the first extrasolar planets were discovered around a pulsars, PSR B1257+12.
black hole acts like an ideal black body, as it reflects no light. first direct image of a black hole and its vicinity was published on 10 April 2019. eleven gravitational wave events have been observed that originated from ten merging black holes. the presence of a black hole can be inferred through its interaction wit

In [11]:
"""
summarize section-wise
"""
p = get_full("Italy")
summary_wise=[] # list of summaries of sections
for key in p:
    section = p[key]
    # get_full stores a dict keyed by sub-heading for sections that contain
    # "=== " sub-sections and a plain list otherwise. p_summarize indexes its
    # argument by position, so sub-sections are flattened into one list first.
    if isinstance(section, dict):
        paragraphs = [para for sub in section.values() for para in sub]
    else:
        paragraphs = section
    summary = p_summarize(paragraphs,min_length=40) # summary of one section
    print(summary)
    summary_wise.append(summary)
print(summary_wise)
# Condense the per-section summaries into one overall summary with T5.
s = summarize(summary_wise,min_length=80)
print(s)

According to Strabo's Geographica, before the expansion of the Roman Republic, the name was used by Greeks to indicate the land between the strait of Messina and the line connecting the gulf of Salerno and gulf of Taranto. In 264 BC, Roman Italy extended from the Arno and Rubicon rivers of the centre-north to the entire south. The northern area of Cisalpine Gaul was occupied by Rome in the 220s BC and became considered geographically and de facto part of Italy.
Thousands of Paleolithic-era artifacts have been recovered from Monte Poggiolo and dated to around 850,000 years before the present. They are the oldest evidence of first hominins habitation in the peninsula. The ancient peoples of pre-Roman Italy were Indo-European peoples. The main historic peoples of possible non-Indo-European heritage include the Etruscans of central and northern Italy. The prehistoric Sardinians gave birth to the Nuragic civilisation. Rome was ruled for a period of 244 years by a monarchical system. Initial

Italy is located in Southern Europe, between latitudes 35° and 47° N, and longitudes 6° and 19° E. To the north, Italy borders France, Switzerland, Austria, and Slovenia. The sovereign states of San Marino and the Vatican City are enclaves within Italy. Campione d'Italia is an Italian exclave in Switzerland. Italy's total area is 301,230 square kilometres (116,306 sq mi), of which 294,020 km2 (113,522 sq mi) is land. The Apennine Mountains form the peninsula's backbone, and the Alps form most of its northern boundary. The Po Valley is the largest plain in Italy, with 46,000 km2 (18,000 sq mi) It represents over 70% of the total plain area in the country. Most of the small islands and archipelagos in the south are volcanic islands. There are also active volcanoes: Mount Etna in Sicily (the largest active volcano in Europe), Vulcano, Stromboli, and Vesuvius. There are also volcanoes on mainland Europe. The five largest lakes in Italy are Garda, Maggiore, Como, Trasimeno and Bolsena. Gard

Italy is a founding member of the G7, the Eurozone and the OECD. It is a highly developed country, with the world's 8th highest quality of life. The country is well known for its creative and innovative business. Italy is the world's sixth largest manufacturing country. It is characterised by a smaller number of global multinational corporations than other economies of comparable size. Its closest trade ties are with the other countries of the European Union, with whom it conducts 59% of its total trade. Fiat Chrysler Automobiles or FCA is currently the world's seventh-largest auto maker. The country boasts a wide range of acclaimed products, from very compact city cars to luxury supercars such as Maserati, Lamborghini and Ferrari. Italy has been hit hard by the Financial crisis of 2007–08, that exacerbated the country's structural problems. A gaping North–South divide is a major factor of socio-economic weakness. The richest province, Alto Adige-South Tyrol, earns 152% of the national

the northern area of Cisalpine Gaul was occupied by Rome in the 220s BC and became considered geographically and de facto part of Italy. the northern area of Cisalpine Gaul was occupied by Rome in the 220s BC and became considered geographically and de facto part of it. the name was used by Greeks to indicate the land between the strait of Messina and the line connecting the gulf of Salerno and gulf of Taranto  Paleolithic-era artifacts have been recovered from Monte Poggiolo. they are the oldest evidence of first hominins habitation in the peninsula. the ancient peoples of pre-Roman Italy were Indo-European peoples. the main historic peoples of possible non-Indo-european heritage include the Etruscans of central and northern Italy. Italy's total area is 301,230 square kilometres (116,306 sq mi) of which 294,020 km2 (113,522 sq mi) is land. there are 14 volcanoes in Italy, four of which are active. most of the rivers of Italy drain either into the Adriatic Sea, such as the Po, Piave, A

In [6]:
from QG import QueGenerator
# Build the question generator from a local model directory.
# NOTE(review): model_dir is an absolute path on one machine; it has to be
# changed before this cell can run anywhere else.
qg = QueGenerator(model_dir="/Users/southdam/Desktop/Master-Project-Flashcard")

# Sample passage; the newlines of the literal are removed so that the
# generator receives a single line of text.
text_example = '''
Mars is the fourth planet from the Sun and the second-smallest planet in 
the Solar System. It carries the name of the Roman god of war and is often 
referred to as the "Red Planet" The latter refers to the effect of iron oxide 
prevalent on Mars's surface, which gives it a reddish appearance.
'''.replace('\n','')

# qs is iterated in the next cell as a sequence of dicts with an "isGood" flag.
qs=qg.generate(text_example)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
# Print only the generated question/answer entries whose "isGood" flag is truthy.
for qa in qs:
    if qa["isGood"]:
        print(qa)

{'answer': 'fourth', 'question': 'Where does Mars rank among planets from the Sun?', 'isGood': True}
{'answer': 'second-smallest', 'question': 'How small is Mars in the solar system?', 'isGood': True}
{'answer': 'iron oxide', 'question': "What is present on Mars's surface that gives it a reddish appearance?", 'isGood': True}
{'answer': 'Red Planet', 'question': "What is the Roman god of war's nickname for Mars?", 'isGood': True}
