# Unstructured Text Dataset Creation with Wikipedia

> https://wikipedia.readthedocs.io/en/latest/code.html#api

In [1]:
import wikipedia
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load('en_core_web_lg')

In [2]:
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [3]:
wikipedia.WikipediaPage('Something').content



  lis = BeautifulSoup(html).find_all('li')


DisambiguationError: "Something" may refer to: 
Something (concept)
indefinite pronoun
Something (Chairlift album)
Something (Shirley Bassey album)
Something (Shirley Scott album)
"Something" (Andrius Pojavis song)
"Something" (Beatles song)
"Something" (Girl's Day song)
"Something" (TVXQ song)
"Something" (Lasgo song)
Music From Another Dimension
Tales from the Lotus Pod
This War Is Ours
Joe Cocker!
Blackberry Way
Oral Fixation Vol. 2
Something for Everybody
Some Things
Anything (disambiguation)
Everything (disambiguation)
Nothing (disambiguation)
Thing (disambiguation)

In [6]:
wikipedia.search('StarWars', suggestion=True)

(['Star Wars',
  'List of Star Wars characters',
  'Star Wars Rebels',
  'Star Wars (film)',
  'List of Star Wars films',
  'Star Wars Trilogy',
  'List of Star Wars planets and moons',
  'Star Wars Resistance',
  'Star Wars: Battlefront',
  'Star Wars: The High Republic'],
 'star wars')

In [7]:
wikipedia.summary('StarWars')

"Star Wars is an American epic space opera media franchise created by George Lucas, which began with the eponymous 1977 film and quickly became a worldwide pop-culture phenomenon. The franchise has been expanded into various films and other media, including television series, video games, novels, comic books, theme park attractions, and themed areas, comprising an all-encompassing fictional universe. In 2020, its total value was estimated at US$70 billion, and it is currently the fifth-highest-grossing media franchise of all time.\nThe original film (Star Wars), retroactively subtitled Episode IV: A New Hope (1977), was followed by the sequels Episode V: The Empire Strikes Back (1980) and Episode VI: Return of the Jedi (1983), forming the original Star Wars trilogy. Lucas later returned to filmmaking to direct a prequel trilogy, consisting of Episode I: The Phantom Menace (1999), Episode II: Attack of the Clones (2002), and Episode III: Revenge of the Sith (2005). In 2012, Lucas sold h

In [8]:
content = wikipedia.WikipediaPage('StarWars').content

### Get topics from Wikipedia content:

In [44]:
topics = []
with open("./Ontology_Learning/dataset/starwars_topics.txt", "r") as file:
    for line in file:
        topics.append(line.strip())
len(topics)

125

In [46]:
total = 0
for i, topic in enumerate(topics):
    content = wikipedia.WikipediaPage(topic).content
    doc = nlp(content)
    sentences = [sent.string.strip() for sent in doc.sents]
    with open("./Ontology_Learning/dataset/starwars_text_dataset.txt", "a") as file:
        for sentence in sentences:
            file.write(sentence + "\n")
    total += len(sentences)
    print('Topic{}: {}, Sentences: {}, Total: {}'.format(i, topic, len(sentences), total))

Topic0: Luke Skywalker, Sentences: 391, Total: 391
Topic1: Anakin Skywalker, Sentences: 462, Total: 853
Topic2: George Lucas, Sentences: 405, Total: 1258
Topic3: Jedi, Sentences: 629, Total: 1887
Topic4: Millennium Falcon, Sentences: 209, Total: 2096
Topic5: Star Wars, Sentences: 465, Total: 2561
Topic6: Shmi Skywalker, Sentences: 324, Total: 2885
Topic7: Leia Organa, Sentences: 522, Total: 3407
Topic8: Han Solo, Sentences: 272, Total: 3679
Topic9: Kylo Ren, Sentences: 287, Total: 3966
Topic10: Padmé Amidala, Sentences: 332, Total: 4298
Topic11: Obi-Wan Kenobi, Sentences: 287, Total: 4585
Topic12: Tatooine, Sentences: 170, Total: 4755
Topic13: X-wing, Sentences: 123, Total: 4878
Topic14: Landspeeder, Sentences: 89, Total: 4967
Topic15: Sandcrawler, Sentences: 61, Total: 5028
Topic16: Snowspeeder, Sentences: 467, Total: 5495
Topic17: Skyhopper (Star Wars), Sentences: 467, Total: 5962
Topic18: Speeder bike, Sentences: 115, Total: 6077
Topic19: Star Destroyer, Sentences: 222, Total: 6299


In [20]:
dataset = pd.DataFrame(data=[['StarWars', content]], columns=['query_term','text'])
dataset

Unnamed: 0,query_term,text
0,StarWars,Star Wars is an American epic space opera medi...


In [21]:
dataset = dataset.append(pd.DataFrame(data=[['Anakin Skywalker', wikipedia.WikipediaPage('Anakin Skywalker').content]], columns=['query_term','text']), ignore_index=True)
dataset = dataset.append(pd.DataFrame(data=[['George Lucas', wikipedia.WikipediaPage('George Lucas').content]], columns=['query_term','text']), ignore_index=True)
dataset = dataset.append(pd.DataFrame(data=[['Jedi', wikipedia.WikipediaPage('Jedi').content]], columns=['query_term','text']), ignore_index=True)
dataset = dataset.append(pd.DataFrame(data=[['Millennium Falcon', wikipedia.WikipediaPage('Millennium Falcon').content]], columns=['query_term','text']), ignore_index=True)
dataset

Unnamed: 0,query_term,text
0,StarWars,Star Wars is an American epic space opera medi...
1,Anakin Skywalker,Darth Vader is a fictional character in the St...
2,George Lucas,"George Walton Lucas Jr. (born May 14, 1944) is..."
3,Jedi,"The Jedi (), Jedi Knights, or the Knights of t..."
4,Millennium Falcon,The Millennium Falcon is a fictional starship ...


In [22]:
dataset.to_csv('starwars_data.csv')

## Text Cleaning

In [157]:
import spacy
nlp = spacy.load("en_core_web_lg", disable=["tagger", "ner"])
nlp.pipeline

[('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f8e698fe980>)]

In [None]:
for index, row in dataset.iterrows():
    doc = nlp(row.text)
    sentences = [sent.string.strip() for sent in doc.sents]
    #print(sentences)

In [158]:
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text in (":"):
            doc[token.i].is_sent_start = False
    return doc

nlp.add_pipe(set_custom_boundaries, before="parser")
nlp.pipeline

[('set_custom_boundaries', <function __main__.set_custom_boundaries(doc)>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f8e698fe980>)]

In [159]:
parsed = []
doc = nlp(dataset.iloc[0].text)
sentences = [sent.string.strip() for sent in doc.sents]
for sent in sentences:
    if len(sent) > 10 and '=' not in sent:
        parsed.append(sent)

In [172]:
nlp = spacy.load("en_core_web_lg")

StarWars
Anakin Skywalker
George Lucas
Jedi
Millennium Falcon

In [178]:
doc = nlp(dataset.iloc[4].text)
results = pd.DataFrame(columns=['Text', 'Start', 'End', 'Label'])

for ent in doc.ents:  
    results = results.append({'Text':ent.text, 'Start':ent.start_char, 'End':ent.end_char, 'Label':ent.label_}, ignore_index=True)
results

Unnamed: 0,Text,Start,End,Label
0,The Millennium Falcon,0,21,ORG
1,Joe Johnston,86,98,PERSON
2,Star Wars,113,122,WORK_OF_ART
3,1977,124,128,DATE
4,The Star Wars Holiday Special,163,192,ORG
...,...,...,...,...
398,Solo:,18291,18296,WORK_OF_ART
399,Millennium Falcon,18351,18368,DATE
400,Wookieepedia,18372,18384,GPE
401,Notes,18421,18426,PRODUCT


In [179]:
results.to_csv('./entities/MillenniumFalcon_entities.csv')

### Read OL Dataset

In [42]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [40]:
dataset = pd.read_csv('./Ontology_Learning/dataset/starwars_OL_dataset.txt', delimiter='\n', header=None)
dataset

Unnamed: 0,0
0,Luke Skywalker is a fictional character and th...
1,"The character, portrayed by Mark Hamill, is an..."
2,He is the twin brother of Rebellion leader Pri...
3,The now non-canon Star Wars expanded universe ...
4,On their list of the 100 Greatest Fictional Ch...
...,...
1299,Lego has also made a version of the Falcon to ...
1300,It is called the Kessel Run Millennium Falcon.
1301,"On May 31, 2019, a full sized replica of the F..."
1302,An identical version of the attraction opened ...


In [68]:
len(set(keywords))

787

### Automatic Dataset Creation

In [35]:
start_topic = 'Military Weapons'
topics = wikipedia.search(start_topic, suggestion=False)
topics

['Military technology',
 'Weapon',
 'Assault weapon',
 'List of historical equipment of the Canadian military',
 'Military',
 'Arms industry',
 'Directed-energy weapon',
 'Military robot',
 'Non-lethal weapon',
 'Service pistol']

In [3]:
curr_topic = 'Military Weapons'
topic_threshold = 100
all_topics = []

topic_count = 0
index = 0
while topic_count < topic_threshold:
    all_topics = list(set(all_topics))
    topics = wikipedia.search(curr_topic, suggestion=False)
    all_topics += topics
    curr_topic = all_topics[index]
    index += 1
    topic_count = len(all_topics)
    print('topic_count: {}'.format(topic_count))

topic_count: 10
topic_count: 20
topic_count: 28
topic_count: 28
topic_count: 28
topic_count: 36
topic_count: 44
topic_count: 53
topic_count: 61
topic_count: 67
topic_count: 67
topic_count: 67
topic_count: 74
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 82
topic_count: 91
topic_count: 91
topic_count: 100


In [5]:
dataset_name = 'military_weapons_dataset.txt'

total = 0
for i, topic in enumerate(all_topics):
    try:
        content = wikipedia.WikipediaPage(topic).content
        doc = nlp(content)
        sentences = [sent.string.strip() for sent in doc.sents]
        with open('./data/' + dataset_name, 'a') as file:
            for sentence in sentences:
                file.write(sentence + '\n')
        total += len(sentences)
        print('Topic{}: {}, Sentences: {}, Total: {}'.format(i, topic, len(sentences), total))
    except:
        continue

Topic0: List of equipment of the Venezuelan Army, Sentences: 8, Total: 8
Topic1: United States Military Government in Cuba, Sentences: 46, Total: 54
Topic2: Military technology, Sentences: 250, Total: 304
Topic3: Russian Armed Forces, Sentences: 370, Total: 674
Topic4: List of states with nuclear weapons, Sentences: 223, Total: 897
Topic5: Brazilian Military Junta of 1969, Sentences: 54, Total: 951
Topic6: List of countries by military expenditures, Sentences: 33, Total: 984
Topic7: Mortar (weapon), Sentences: 229, Total: 1213
Topic8: Astra (weapon), Sentences: 30, Total: 1243
Topic9: List of active Russian Air Force aircraft, Sentences: 8, Total: 1251
Topic10: List of Russian flags, Sentences: 51, Total: 1302
Topic11: List of equipment of the Ukrainian Ground Forces, Sentences: 33, Total: 1335
Topic12: Outline of natural science, Sentences: 704, Total: 2039
Topic13: National Reorganization Process, Sentences: 209, Total: 2248
Topic14: Outline of formal science, Sentences: 345, Total: 