# Introducing Wikipedia Data


In [1]:
import os

# Show the names of the files stored in the "wiki" directory.
os.listdir("wiki")

['Margo_Reuten.html',
 'Hope_7_(album).html',
 'Nejat_Alp.html',
 'Pr%C3%A9liminaires.html',
 'EDP_Sarichioi_Wind_Farm.html',
 'Kenny_Cordray.html',
 'Amborella.html',
 'Valentin_Yanin.html',
 'Yarumal.html',
 'Urs_Burkart.html',
 'Charles_Stuart_(rugby_union).html',
 'Fahy,_County_Mayo.html',
 'Acacia_dermatophylla.html',
 'Peter_Collingwood.html',
 'Manhattan_Murder_Mystery.html',
 'Reb_Russell.html',
 'Shigeo_Kurata.html',
 '83_(number).html',
 'I_Am_Cold.html',
 'KMTZ.html',
 'Vanavara_Airport.html',
 'Kul_Gul.html',
 'Lee_Henderson_Watkins.html',
 'Zgornji_Otok.html',
 'Watsonville_Junction,_California.html',
 'Cosmopterix_similis.html',
 'Tapat%C3%ADo_hot_sauce.html',
 'Plze%C5%88_Zoo.html',
 'Olivaceous_flatbill.html',
 'Hermann_Nuding.html',
 'Yoshinkan.html',
 'Shellie_Morris.html',
 'Derek_Acorah.html',
 'List_of_Argentine_Primera_Divisi%C3%B3n_transfers_January_2011.html',
 'DiGiorgio_Corporation.html',
 'Peltigera_membranacea.html',
 'Hepatitis_B_virus.html',
 'Vaccinium_pa

In [2]:
# Report how many entries the "wiki" directory contains.
print(len(os.listdir("wiki")))

999


In [4]:
# Peek at the raw markup of a single article. The encoding is passed
# explicitly because the page declares itself as UTF-8
# (<meta charset="UTF-8"/>) and the platform's default text encoding is not
# guaranteed to match.
with open("wiki/James_L._Hull.html", 'r', encoding="utf-8") as f:
    print(f.readlines())

['<!DOCTYPE html>\n', '<html class="client-nojs" lang="en" dir="ltr">\n', '<head>\n', '<meta charset="UTF-8"/>\n', '<title>James L. Hull - Wikipedia</title>\n', '<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n', '<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"James_L._Hull","wgTitle":"James L. Hull","wgCurRevisionId":731564355,"wgRevisionId":731564355,"wgArticleId":28029244,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All stub articles","1873 births","1928 deaths","United States Navy Medal of Honor recipients","United States Navy sailors","American military personnel of the Spanish–American War","People from Illinois","Spanish–American War recipients of the Medal of Honor","United States Navy personnel st


It appears that the main content is nested inside the `div` tag with the id `content`.

# Reading In The Data

In [5]:
import concurrent.futures
import time

# Thread pool with four workers; used below to read the article files.
pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)

def read_data(filename, encoding="utf-8"):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            instead of the platform's locale encoding, so the same file
            decodes identically on every machine.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(filename, encoding=encoding) as f:
        return f.read()

start = time.time()
filenames = ["wiki/{}".format(f) for f in os.listdir("wiki")]
# Executor.map returns a lazy iterator; list() drains it so that every file
# has actually been read before the timer is stopped.
content = list(pool.map(read_data, filenames))
end = time.time()

# The reads are finished, so release the worker threads instead of leaving
# them idle for the rest of the session.
pool.shutdown()

print(end - start)

# Article name = file base name with a trailing ".html" removed. Only the
# suffix is stripped, so a name that happens to contain ".html" elsewhere is
# left intact.
articles = [
    name[:-len(".html")] if name.endswith(".html") else name
    for name in (os.path.basename(path) for path in filenames)
]

0.23751139640808105


After doing some profiling, it doesn't appear that threading makes a huge difference to performance. This may be because, although the file reads can overlap, most of the time saved is offset by the overhead of creating new threads.

# Remove Extraneous Markup

In [6]:
from bs4 import BeautifulSoup

def parse_html(html):
    """Return the markup of the page's ``<div id="content">`` element.

    Args:
        html: HTML source of a page, as a string.

    Returns:
        The first ``div`` whose id is ``content``, serialized with ``str()``.

    Raises:
        IndexError: If the document contains no such ``div``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find() stops at the first match, whereas find_all() would collect every
    # matching element only for the first one to be used.
    content_div = soup.find("div", id="content")
    if content_div is None:
        # Same exception type an empty find_all()[0] would raise, but with a
        # message that says what was missing.
        raise IndexError('no <div id="content"> element found in document')
    return str(content_div)

start = time.time()
# The context manager shuts the worker processes down as soon as parsing is
# done; a bare executor keeps them alive until it is explicitly shut down.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    # list() forces the lazy map iterator so all results are collected
    # before the pool closes and the timer stops.
    parsed = list(pool.map(parse_html, content))
end = time.time()

print(end - start)

28.587507247924805


In [7]:
# Display the first parsed article (the string returned by parse_html).
parsed[0]

'<div class="mw-body" id="content" role="main">\n<a id="top"></a>\n<div id="siteNotice"><!-- CentralNotice --></div>\n<div class="mw-indicators">\n</div>\n<h1 class="firstHeading" id="firstHeading" lang="en">Margo Reuten</h1>\n<div class="mw-body-content" id="bodyContent">\n<div id="siteSub">From Wikipedia, the free encyclopedia</div>\n<div id="contentSub"></div>\n<div class="mw-jump" id="jump-to-nav">\n\t\t\t\t\tJump to:\t\t\t\t\t<a href="#mw-head">navigation</a>, \t\t\t\t\t<a href="#p-search">search</a>\n</div>\n<div class="mw-content-ltr" dir="ltr" id="mw-content-text" lang="en"><table class="infobox biography vcard" style="width:22em">\n<tr>\n<th colspan="2" style="text-align:center;font-size:125%;font-weight:bold"><span class="fn">Margo Reuten</span></th>\n</tr>\n<tr>\n<th scope="row">Born</th>\n<td>abt. 1966<br/>\n<span class="birthplace"><a href="/wiki/Maasbracht" title="Maasbracht">Maasbracht</a></span></td>\n</tr>\n<tr>\n<th scope="row">Nationality</th>\n<td class="category"><

This operation is quite slow and CPU-intensive. It looks like using as many processes as there are available processors speeds things up.

# Finding common tags

In [8]:
from bs4 import BeautifulSoup

def count_tags(html):
    """Tally how often each tag name occurs in a piece of HTML.

    Args:
        html: HTML markup, as a string.

    Returns:
        A dict mapping each tag name to the number of times it appears.
    """
    tally = {}
    # find_all() with no arguments yields every tag in the document.
    for element in BeautifulSoup(html, 'html.parser').find_all():
        tally[element.name] = tally.get(element.name, 0) + 1
    return tally

start = time.time()
# The context manager shuts the worker processes down once counting is done
# instead of leaving them alive after the cell finishes.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    tags = list(pool.map(count_tags, parsed))

# Merge the per-article tallies into a single total per tag name.
tag_counts = {}
for article_tags in tags:
    for name, occurrences in article_tags.items():
        tag_counts[name] = tag_counts.get(name, 0) + occurrences
end = time.time()

print(end - start)
tag_counts

15.071829080581665


{'a': 161065,
 'abbr': 3665,
 'annotation': 2,
 'area': 39,
 'audio': 2,
 'b': 14455,
 'bdi': 4,
 'big': 75,
 'blockquote': 58,
 'br': 4986,
 'caption': 200,
 'center': 64,
 'cite': 3563,
 'code': 108,
 'dd': 1376,
 'del': 2,
 'div': 28581,
 'dl': 457,
 'dt': 334,
 'font': 40,
 'h1': 999,
 'h2': 4045,
 'h3': 777,
 'h4': 117,
 'h5': 4,
 'h6': 1,
 'hr': 51,
 'i': 18246,
 'img': 6701,
 'li': 85779,
 'map': 2,
 'math': 2,
 'mo': 2,
 'mrow': 2,
 'mstyle': 2,
 'noscript': 999,
 'ol': 858,
 'p': 7998,
 'pre': 1,
 'q': 76,
 'rb': 16,
 'rp': 32,
 'rt': 16,
 'ruby': 16,
 's': 10,
 'samp': 2,
 'semantics': 2,
 'small': 3272,
 'source': 2,
 'span': 67350,
 'strong': 599,
 'sub': 151,
 'sup': 11157,
 'table': 4010,
 'td': 57673,
 'th': 14472,
 'tr': 27300,
 'u': 51,
 'ul': 10972,
 'wbr': 85}

Based on our findings, it looks like there are quite a few td, a, li, and span tags. This indicates that articles tend to have lots of links, along with lists and tables. Links are the most numerous tag, which indicates how interconnected articles on Wikipedia are.

# Finding common words

In [9]:
from bs4 import BeautifulSoup
from collections import Counter
import re

def count_words(html, min_length=5, top_n=10):
    """Return the most frequent long words in the text of an HTML fragment.

    The markup is stripped, the text is lower-cased and split on runs of
    non-word characters, and words shorter than ``min_length`` are dropped.

    Args:
        html: HTML markup, as a string.
        min_length: Minimum number of characters a word needs to be counted.
        top_n: How many of the most frequent words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, holding at
        most ``top_n`` entries.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Raw string: "\W" inside an ordinary string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    text = re.sub(r"\W+", " ", soup.get_text().lower())
    # Splitting on a single space can yield empty strings at the edges; the
    # length filter discards them along with the short words.
    words = [w for w in text.split(" ") if len(w) >= min_length]
    return Counter(words).most_common(top_n)

start = time.time()
# The context manager shuts the worker processes down once counting is done
# instead of leaving them alive after the cell finishes.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    words = list(pool.map(count_words, parsed))

# For every word, count the number of articles in which it is one of that
# article's most common words.
# NOTE(review): the per-article occurrence count is unpacked but not added,
# so these are article counts rather than total occurrences — confirm that
# this is the intended metric.
word_counts = {}
for top_words in words:
    for word, _count in top_words:
        word_counts[word] = word_counts.get(word, 0) + 1
end = time.time()

print(end - start)
word_counts

15.27911376953125


{'cordray': 1,
 'indigenous': 1,
 'fifth': 2,
 'television': 21,
 '邵逸夫医院': 1,
 '1930s': 1,
 'framing': 1,
 'states': 50,
 'batak': 1,
 'colliery': 1,
 'techniques': 1,
 'lions': 1,
 'photography': 1,
 'djougou': 1,
 'cardiff': 1,
 'sloane': 1,
 'cotton': 1,
 'slovene': 1,
 'cabrini': 1,
 'brooks': 1,
 'checa': 1,
 'fortress': 1,
 'singing': 2,
 'kjetil': 1,
 'pedro': 1,
 'reservoir': 1,
 'month': 1,
 'battles': 1,
 'stratagem': 1,
 'trinity': 1,
 'publications': 2,
 'hasbro': 1,
 'malaysia': 2,
 'grenfell': 1,
 'natasha': 1,
 'woody': 1,
 'shioda': 1,
 'peene': 1,
 'schauspielhaus': 1,
 'infusella': 1,
 '00634': 1,
 'kamakhyanagar': 1,
 'airways': 1,
 'kistenuten': 1,
 'furubira': 1,
 'birth': 9,
 'brooke': 2,
 'concord': 1,
 'cavite': 1,
 'haute': 1,
 'automated': 1,
 'varese': 1,
 'turned': 1,
 'municipal': 5,
 'skaggs': 1,
 'blues': 6,
 'bulgarian': 3,
 'https': 2,
 'hokkaido': 2,
 'kenny': 4,
 'processes': 1,
 'intelligence': 1,
 'aspect': 1,
 'perez': 1,
 'index': 5,
 'shadow': 1,