# Unstructured Text Dataset Creation with Wikipedia

> API reference for the `wikipedia` Python package: https://wikipedia.readthedocs.io/en/latest/code.html#api

In [53]:
from pathlib import Path

import numpy as np
import pandas as pd
import wikipedia

In [46]:
# Look up page titles matching 'StarWars'. With suggestion=True the call returns a
# (list_of_titles, suggested_query) tuple, as the recorded output shows.
wikipedia.search('StarWars', suggestion=True)

(['Star Wars',
  'List of Star Wars characters',
  'Star Wars: Battlefront',
  'Star Wars Rebels',
  'List of Star Wars films',
  'Star Wars Resistance',
  'Star Wars Trilogy',
  'Star Wars (film)',
  'Star Wars: The High Republic',
  'Rey (Star Wars)'],
 'star wars')

In [45]:
# Fetch the summary text for the 'StarWars' query; the recorded output is the
# summary of the "Star Wars" article.
wikipedia.summary('StarWars')

"Star Wars is an American epic space opera media franchise created by George Lucas, which began with the eponymous 1977 film and quickly became a worldwide pop-culture phenomenon. The franchise has been expanded into various films and other media, including television series, video games, novels, comic books, theme park attractions, and themed areas, comprising an all-encompassing fictional universe. In 2020, its total value was estimated at US$70 billion, and it is currently the fifth-highest-grossing media franchise of all time.\nThe original film (Star Wars), retroactively subtitled Episode IV: A New Hope (1977), was followed by the sequels Episode V: The Empire Strikes Back (1980) and Episode VI: Return of the Jedi (1983), forming the original Star Wars trilogy. Lucas later returned to filmmaking to direct a prequel trilogy, consisting of Episode I: The Phantom Menace (1999), Episode II: Attack of the Clones (2002), and Episode III: Revenge of the Sith (2005). In 2012, Lucas sold h

In [80]:
# Fetch the page for the 'StarWars' query and keep the value of its `content` attribute.
starwars_page = wikipedia.WikipediaPage('StarWars')
content = starwars_page.content

In [93]:
# Seed the corpus table with a single row: the query term and the text fetched for it.
seed_record = {'query_term': 'StarWars', 'text': content}
dataset = pd.DataFrame([seed_record], columns=['query_term', 'text'])
dataset

Unnamed: 0,query_term,text
0,StarWars,Star Wars is an American epic space opera medi...


In [94]:
# Additional query terms to add to the corpus, one row per term.
extra_terms = ['Anakin Skywalker', 'George Lucas', 'Jedi', 'Millennium Falcon']

# Fetch every page first, then extend the table in a single step.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
extra_rows = pd.DataFrame(
    [[term, wikipedia.WikipediaPage(term).content] for term in extra_terms],
    columns=['query_term', 'text'],
)
dataset = pd.concat([dataset, extra_rows], ignore_index=True)
dataset

Unnamed: 0,query_term,text
0,StarWars,Star Wars is an American epic space opera medi...
1,Anakin Skywalker,Darth Vader is a fictional character in the St...
2,George Lucas,"George Walton Lucas Jr. (born May 14, 1944) is..."
3,Jedi,The Jedi () are the main protagonists of many ...
4,Millennium Falcon,The Millennium Falcon is a fictional starship ...


In [95]:
# Persist the collected (query_term, text) rows to 'starwars_data.csv' (relative path).
dataset.to_csv('starwars_data.csv')

## Text Cleaning

In [157]:
import spacy
# Load the large English model with the tagger and NER components switched off;
# the recorded output shows only the parser left in the pipeline.
disabled_components = ["tagger", "ner"]
nlp = spacy.load("en_core_web_lg", disable=disabled_components)
nlp.pipeline

[('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f8e698fe980>)]

In [None]:
# Sentence-split every article in `dataset`.
# NOTE: `doc` and `sentences` are overwritten on each pass, so after the loop
# they hold the values for the last row only.
for article_text in dataset['text']:
    doc = nlp(article_text)
    # Span.text is available on spaCy v2 and v3; Span.string was removed in v3.
    sentences = [sent.text.strip() for sent in doc.sents]

In [158]:
def set_custom_boundaries(doc):
    """Prevent a sentence break immediately after a colon.

    For every ":" token (except a trailing one), the token that follows it is
    marked as not being a sentence start.

    Args:
        doc: the document to annotate. It must support slicing and integer
            indexing, and its tokens must expose ``text``, ``i`` and a
            writable ``is_sent_start``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # The last token is skipped because it has no following token to flag.
    for token in doc[:-1]:
        # Exact comparison: `in (":")` would be a substring test against a plain string.
        if token.text == ":":
            doc[token.i + 1].is_sent_start = False
    return doc

# Register the custom boundary component ahead of the parser.
# Any previous registration is removed first so this cell can be re-run
# (adding a component whose name is already in the pipeline raises).
if "set_custom_boundaries" in nlp.pipe_names:
    nlp.remove_pipe("set_custom_boundaries")
nlp.add_pipe(set_custom_boundaries, before="parser")
nlp.pipeline

[('set_custom_boundaries', <function __main__.set_custom_boundaries(doc)>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f8e698fe980>)]

In [159]:
# Sentence-split the first article (row 0 of `dataset`, the 'StarWars' query).
doc = nlp(dataset.iloc[0].text)
# Span.text is available on spaCy v2 and v3; Span.string was removed in v3.
sentences = [sent.text.strip() for sent in doc.sents]

# Keep only sentences longer than 10 characters that contain no '='
# (presumably to drop '== Heading ==' style section markers - confirm).
parsed = [sent for sent in sentences if len(sent) > 10 and '=' not in sent]

In [172]:
# Reload the model without the `disable` list used earlier. This rebinds `nlp`,
# so the pipeline that had set_custom_boundaries added is no longer what `nlp` refers to.
nlp = spacy.load("en_core_web_lg")

StarWars
Anakin Skywalker
George Lucas
Jedi
Millennium Falcon

In [178]:
# Process row 4 of `dataset` (the 'Millennium Falcon' query) and tabulate its entities.
doc = nlp(dataset.iloc[4].text)

# Collect one record per entity and build the frame once. DataFrame.append was
# removed in pandas 2.0, and calling it per row re-copied the frame each time.
entity_rows = [
    {'Text': ent.text, 'Start': ent.start_char, 'End': ent.end_char, 'Label': ent.label_}
    for ent in doc.ents
]
# Passing `columns` keeps the four headers even when no entities are found.
results = pd.DataFrame(entity_rows, columns=['Text', 'Start', 'End', 'Label'])
results

Unnamed: 0,Text,Start,End,Label
0,The Millennium Falcon,0,21,ORG
1,Joe Johnston,86,98,PERSON
2,Star Wars,113,122,WORK_OF_ART
3,1977,124,128,DATE
4,The Star Wars Holiday Special,163,192,ORG
...,...,...,...,...
398,Solo:,18291,18296,WORK_OF_ART
399,Millennium Falcon,18351,18368,DATE
400,Wookieepedia,18372,18384,GPE
401,Notes,18421,18426,PRODUCT


In [179]:
# Write the entity table under ./entities/, creating the directory first:
# to_csv does not create missing parent directories and would fail on a fresh checkout.
entities_dir = Path('./entities')
entities_dir.mkdir(parents=True, exist_ok=True)
results.to_csv(entities_dir / 'MillenniumFalcon_entities.csv')