In [1]:
#!pip install beautifulsoup4 lxml
#!wget https://dumps.wikimedia.org/enwiki/20190420/enwiki-20190420-pages-articles-multistream1.xml-p10p30302.bz2
#!bunzip2 enwiki-20190420-pages-articles-multistream1.xml-p10p30302.bz2    

In [2]:
# Count lines, words, and bytes in the XML data file (default `wc` column order)
!wc enwiki-20190420-pages-articles-multistream1.xml-p10p30302

 4586335 74563548 642819893 enwiki-20190420-pages-articles-multistream1.xml-p10p30302


In [1]:
# Duplicate of the earlier `wc` cell: line, word, and byte counts for the dump
!wc enwiki-20190420-pages-articles-multistream1.xml-p10p30302

 4586335 74563548 642819893 enwiki-20190420-pages-articles-multistream1.xml-p10p30302


In [2]:
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
def get_pages_from_xml_file(filename, start_tag='<page>', end_tag='</page>',
                            encoding='utf-8'):
    """Yield each page of an XML dump as one string, reading line by line.

    A page starts on a line containing ``start_tag`` and ends on the next
    line containing ``end_tag``; both lines are included in the result.
    Lines outside a page (file header/footer, stray end tags) are ignored.

    Args:
        filename: Path of the XML file to read.
        start_tag: Substring marking the first line of a page.
        end_tag: Substring marking the last line of a page.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Yields:
        str: The concatenated lines of one page, line endings preserved.
    """
    page = None  # Lines of the page being collected; None while outside a page.
    with open(filename, encoding=encoding) as f:
        for line in f:
            if start_tag in line:
                # A new start tag always begins a fresh page.
                page = [line]
            elif page is not None:
                page.append(line)
            else:
                # Outside any page: skip instead of failing on a stray end tag.
                continue
            # Checked after the start-tag branch too, so a page whose start
            # and end tags share a single line is still emitted.
            if end_tag in line:
                yield ''.join(page)
                page = None

In [4]:
# Path of the decompressed dump, relative to the notebook's working directory
filename = 'enwiki-20190420-pages-articles-multistream1.xml-p10p30302'
# Materialize every page yielded by the generator into an in-memory list of strings
pages = list(get_pages_from_xml_file(filename))

In [5]:
def get_title_from_page_xml(page_xml):
    """Extract the title from the XML of a single page.

    The markup is parsed with BeautifulSoup's ``'lxml'`` parser and the text
    of the element matched by the CSS selector ``title`` is returned.
    """
    parsed = BeautifulSoup(page_xml, 'lxml')
    title_element = parsed.select_one('title')
    return title_element.text

In [6]:
len(pages)  # Show the number of pages

19822

In [7]:
# Parse every page to collect its title; %time reports the cost of the full pass
%time titles = [get_title_from_page_xml(page) for page in pages]

CPU times: user 42.4 s, sys: 4.31 s, total: 46.7 s
Wall time: 48.9 s


In [8]:
titles[:5] + ['...'] + titles[-5:]  # Show the first and last 5 titles

['AccessibleComputing',
 'Anarchism',
 'AfghanistanHistory',
 'AfghanistanGeography',
 'AfghanistanPeople',
 '...',
 'The Lord of the Rings/One Ring',
 'Tax Freedom Day',
 'Tax',
 'Transhumanism',
 'TARDIS']

In [14]:
print(pages[3])  # Print the fourth page (index 3)

  <page>
    <title>AfghanistanGeography</title>
    <ns>0</ns>
    <id>14</id>
    <redirect title="Geography of Afghanistan" />
    <revision>
      <id>783865160</id>
      <parentid>407008307</parentid>
      <timestamp>2017-06-05T04:18:23Z</timestamp>
      <contributor>
        <username>Tom.Reding</username>
        <id>9784415</id>
      </contributor>
      <minor />
      <comment>+{{Redirect category shell}} using [[Project:AWB|AWB]]</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">#REDIRECT [[Geography of Afghanistan]]

{{Redirect category shell|1=
{{R from CamelCase}}
}}</text>
      <sha1>39r4w8qg62iexlyskf0ga3tblagtl8x</sha1>
    </revision>
  </page>



In [24]:
def get_text_from_page_xml(page):
    """Extract the body text from the XML of a single page.

    The markup is parsed with BeautifulSoup's ``'lxml'`` parser and the text
    of the element matched by the CSS selector ``text`` is returned.
    """
    parsed = BeautifulSoup(page, 'lxml')
    text_element = parsed.select_one('text')
    return text_element.text

In [25]:
# Spot-check: extract the text of the first page
get_text_from_page_xml(pages[0])

'#REDIRECT [[Computer accessibility]]\n\n{{R from move}}\n{{R from CamelCase}}\n{{R unprintworthy}}'

In [26]:
# Inspect the text of the first page and confirm it comes back as a plain str.
# (The helper parses the page itself, so no separate BeautifulSoup object is needed.)
page = pages[0]
text = get_text_from_page_xml(page)
text, type(text)

('#REDIRECT [[Computer accessibility]]\n\n{{R from move}}\n{{R from CamelCase}}\n{{R unprintworthy}}',
 str)

In [27]:
# Build one {'title', 'text'} record per page.
# `titles` was already extracted from `pages` (same order) in an earlier cell,
# so reuse it here rather than parsing every page a second time for its title.
assert len(titles) == len(pages), 'titles is out of sync with pages; re-run the titles cell'
data = [
    {
        'title': title,
        'text': get_text_from_page_xml(page_xml),
    }
    for title, page_xml in zip(titles, pages)
]

In [29]:
data[8]

{'title': 'AfghanistanTransnationalIssues',
 'text': '#REDIRECT [[Foreign relations of Afghanistan]]\n\n{{Redirect category shell|1=\n {{R from CamelCase}}\n}}'}

In [30]:
# One row per page, with columns taken from the 'title' and 'text' keys of each record
df = pd.DataFrame(data)
df.head()

Unnamed: 0,text,title
0,#REDIRECT [[Computer accessibility]]\n\n{{R fr...,AccessibleComputing
1,{{redirect2|Anarchist|Anarchists|the fictional...,Anarchism
2,#REDIRECT [[History of Afghanistan]]\n\n{{Redi...,AfghanistanHistory
3,#REDIRECT [[Geography of Afghanistan]]\n\n{{Re...,AfghanistanGeography
4,#REDIRECT [[Demographics of Afghanistan]]\n\n{...,AfghanistanPeople


### Challenge

#### 1. Create a Pandas dataframe containing the title and text of each page.

* Implement the `get_text_from_page_xml` function above.
* Re-create the dataframe with the text field filled in.

#### 2. Identify the five pages that have the _longest_ text.

* Find the length of each page's `<text>...</text>` element and add it to your dataframe.
* Sort the dataframe by text length, descending.
* What are the titles of the five longest articles?

In [31]:
# Number of characters in each page's text, via the vectorized string accessor
df['text_char_length'] = df['text'].str.len()

In [34]:
# Longest pages first. sort_values already returns a new DataFrame, so no extra .copy() is needed.
df = df.sort_values('text_char_length', ascending=False)

In [44]:
# Titles of the first five rows of the length-sorted frame, as a plain Python list
titles_of_longest_5 = df['title'].iloc[:5].tolist()

In [45]:
titles_of_longest_5

['List of compositions by Johann Sebastian Bach',
 'Pakistan',
 'Philippines',
 'Foreign relations of India',
 'History of India']