In [1]:
import sys
sys.path.append("..")

import spacy
from src.iuextract.extract import label_ius, segment_ius
from src.iuextract.utils import iu2str, gen_iu_collection, get_iu_str_list
from src.iuextract.data import clean_str
import re
# Sample passage used throughout the notebook, written one sentence per
# literal; adjacent literals are concatenated into a single string.
text = (
    "Apollo 11 (July 16–24, 1969) was the American spaceflight that first landed humans on the Moon. "
    "Commander Neil Armstrong and Lunar Module Pilot Buzz Aldrin landed the Apollo Lunar Module Eagle on July 20, 1969, at 20:17 UTC, and Armstrong became the first person to step onto the Moon's surface six hours and 39 minutes later, on July 21 at 02:56 UTC. "
    "Aldrin joined him 19 minutes later, and they spent about two and a quarter hours together exploring the site they had named Tranquility Base upon landing. "
    "Armstrong and Aldrin collected 47.5 pounds (21.5 kg) of lunar material to bring back to Earth as pilot Michael Collins flew the Command Module Columbia in lunar orbit, and were on the Moon's surface for 21 hours, 36 minutes before lifting off to rejoin Columbia."
)

# Load the spaCy pipeline and run it over the passage; `parsed` is what the
# following cells label and read IUs from.
nlp = spacy.load("en_core_web_lg")
parsed = nlp(text)

In [2]:
# Label the sentences of the parsed document. The return value is not used
# here; the IU strings are read back from `parsed` itself on the next line.
label_ius(parsed.sents)

# Collect the IU strings and print them (print keeps the flat one-line list).
iu_strings = get_iu_str_list(parsed)
print(iu_strings)

['Apollo 11 was the American spaceflight', '(July 16–24, 1969)', 'that first landed humans on the Moon.', 'Commander Neil Armstrong and Lunar Module Pilot Buzz Aldrin landed the Apollo Lunar Module Eagle', 'on July 20, 1969,', 'at 20:17 UTC,', 'and Armstrong became the first person to step six hours and 39 minutes later, on July 21 at 02:56 UTC.', "onto the Moon 's surface", 'Aldrin joined him 19 minutes later,', 'and they spent about two and a quarter hours together exploring the site', 'they had named Tranquility Base', 'upon landing.', 'Armstrong and Aldrin collected 47.5 pounds of lunar material and were for 21 hours,', '(21.5 kg)', 'to bring back to Earth', 'as pilot Michael Collins flew the Command Module Columbia in lunar orbit,', "on the Moon 's surface", '36 minutes before lifting off', 'to rejoin Columbia.']


In [3]:
# Rebuild each IU string by hand from the collection of tokens.
#
# Tokens are joined with single spaces, which leaves stray gaps around
# punctuation ("1969 ," / "( July"). The substitutions below are applied in
# this exact order: first remove whitespace before , : ; . and then tidy
# the inside edges of (), [] and {}.
SPACING_FIXES = [
    (r"\s+\,", ","),
    (r"\s+\:", ":"),
    (r"\s+\;", ";"),
    (r"\s+\.", "."),
    (r"\(\s+", "("),
    (r"\s+\)", ")"),
    (r"\[\s+", "["),
    (r"\s+\]", "]"),
    (r"\{\s+", "{"),
    (r"\s+\}", "}"),
]

# `disc` is not used in this cell; only the IU collection is walked.
coll, disc = gen_iu_collection(parsed)

res = []
for iu_tokens in coll.values():
    # One space-separated string per IU, passed through the project's
    # clean_str helper before the punctuation spacing is repaired.
    iu_text = clean_str(' '.join(str(tok) for tok in iu_tokens))
    for pattern, replacement in SPACING_FIXES:
        iu_text = re.sub(pattern, replacement, iu_text)
    res.append(iu_text)
res

['Apollo 11 was the American spaceflight',
 '(July 16–24, 1969)',
 'that first landed humans on the Moon.',
 'Commander Neil Armstrong and Lunar Module Pilot Buzz Aldrin landed the Apollo Lunar Module Eagle',
 'on July 20, 1969,',
 'at 20:17 UTC,',
 'and Armstrong became the first person to step six hours and 39 minutes later, on July 21 at 02:56 UTC.',
 "onto the Moon 's surface",
 'Aldrin joined him 19 minutes later,',
 'and they spent about two and a quarter hours together exploring the site',
 'they had named Tranquility Base',
 'upon landing.',
 'Armstrong and Aldrin collected 47.5 pounds of lunar material and were for 21 hours,',
 '(21.5 kg)',
 'to bring back to Earth',
 'as pilot Michael Collins flew the Command Module Columbia in lunar orbit,',
 "on the Moon 's surface",
 '36 minutes before lifting off',
 'to rejoin Columbia.']

In [4]:
# One-call variant: hand the raw text to segment_ius and request string
# output. The displayed list matches the one built step by step from
# `parsed` in the earlier cells.
segs = segment_ius(
    text,
    mode="str",
)
segs

['Apollo 11 was the American spaceflight',
 '(July 16–24, 1969)',
 'that first landed humans on the Moon.',
 'Commander Neil Armstrong and Lunar Module Pilot Buzz Aldrin landed the Apollo Lunar Module Eagle',
 'on July 20, 1969,',
 'at 20:17 UTC,',
 'and Armstrong became the first person to step six hours and 39 minutes later, on July 21 at 02:56 UTC.',
 "onto the Moon 's surface",
 'Aldrin joined him 19 minutes later,',
 'and they spent about two and a quarter hours together exploring the site',
 'they had named Tranquility Base',
 'upon landing.',
 'Armstrong and Aldrin collected 47.5 pounds of lunar material and were for 21 hours,',
 '(21.5 kg)',
 'to bring back to Earth',
 'as pilot Michael Collins flew the Command Module Columbia in lunar orbit,',
 "on the Moon 's surface",
 '36 minutes before lifting off',
 'to rejoin Columbia.']