# Review the Folder wiki

In [1]:
import os

# os.listdir('wiki') - use this to list all files in wiki folder

# Count up and display the number of files in the wiki folder

count_wiki_folder = len(os.listdir('wiki'))

count_wiki_folder

999

In [2]:
with open("wiki/Zehut.html", encoding="utf8") as f:
    print(f.read())

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Zehut - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Zehut","wgTitle":"Zehut","wgCurRevisionId":765862038,"wgRevisionId":765862038,"wgArticleId":49635088,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with a promotional tone from October 2016","All articles with a promotional tone","Pages using deprecated image syntax","All articles with unsourced statements","Articles with unsourced statements from February 2017","2015 establishments in Israel","Liberal parties in Israel","Political parties established in 2015","Zionist political parti

It appears that the main content is nested inside the div tag with the id content.

# Reading in the Data

In [3]:
import concurrent.futures
import time

pool = concurrent.futures.ThreadPoolExecutor(max_workers = 5)

def read_data(filename):
    with open(filename, encoding="utf8") as f:
        data = f.read()
    return data

# Read all of the files into list called content
file_list = ["wiki/{}".format(f) for f in os.listdir('wiki')]
content = list(pool.map(read_data, file_list))

# Create a list called articles that contains the article names
articles = [f.replace('wiki/', '').replace('.html', '') for f in file_list]

articles[:10]

['Cristian_Berdeja',
 'Bob_Bass',
 'Louis_Vivet',
 '104th_Logistic_Support_Brigade_(United_Kingdom)',
 'Zehut',
 'Christine_Gardner',
 'I%27m_Walking_Behind_You',
 'Bill_Widenhouse',
 'Friendship,_Wake_County,_North_Carolina',
 'Harry_Lynde_Bradley']

In [4]:
# Test max_workers

time_run = {}

for i in range(8):
    test_pool = concurrent.futures.ThreadPoolExecutor(max_workers = (i+1))
    start = time.time()
    file_list_test = ["wiki/{}".format(f) for f in os.listdir('wiki')]
    content_test = list(pool.map(read_data, file_list))
    time_run[i+1] = time.time() - start
    
time_run

{1: 1.096341848373413,
 2: 1.0523681640625,
 3: 1.034379005432129,
 4: 1.0583646297454834,
 5: 1.1493098735809326,
 6: 1.071357011795044,
 7: 1.2092738151550293,
 8: 1.211272954940796}

time_run showed little improvement with numbers of thread, likely result of offset from creating thread.

# Remove the extraneous markup

In [18]:
from bs4 import BeautifulSoup
import parse # from parse.py

#def parse_html(html):
    #soup = BeautifulSoup(open(html), 'html.parser')
    #return str(soup.find_all("div", id="content")[0])
# function was saved in parse.py to work around Windows multiprocessing bug in Jupyter

time_parse = {}
if __name__ == '__main__':
    for i in range(6):
        start = time.time()
        pool = concurrent.futures.ProcessPoolExecutor(max_workers = (i+1))
        parsed = list(pool.map(parse.parse_html, content))
        time_parse[i+1] = time.time() - start  

if __name__ == '__main__':
    pool = concurrent.futures.ProcessPoolExecutor(max_workers = (i+1))
    parsed = list(pool.map(parse.parse_html, content))
    
time_parse

{1: 50.452088356018066,
 2: 25.340067148208618,
 3: 17.585516452789307,
 4: 15.815162420272827,
 5: 15.450891971588135,
 6: 14.01800012588501}

This operation is quite slow and CPU-intensive. It looks like using as many processes are there are available processors speeds things up. However, performance peaks around 5 max_workers. More processes would have significantly impact other computer operations.

# Finding Common Tags

In [27]:
from bs4 import BeautifulSoup
import tagCount

if __name__ == '__main__':
    pool_tag = concurrent.futures.ProcessPoolExecutor(max_workers = 4)
    tag_list = list(pool_tag.map(tagCount.tag_count, parsed))

tag_dict = {}

for tag in tag_list:
    for k, v in tag.items():
        if k not in tag_dict:
            tag_dict[k] = 0
        tag_dict[k] += v
        
sort_tag = sorted(tag_dict.items(), key=lambda kv: kv[1], reverse=True)

tag = dict(sort_tag)

tag

{'a': 161065,
 'li': 85779,
 'span': 67350,
 'td': 57673,
 'div': 28581,
 'tr': 27300,
 'i': 18246,
 'th': 14472,
 'b': 14455,
 'sup': 11157,
 'ul': 10972,
 'p': 7998,
 'img': 6701,
 'br': 4986,
 'h2': 4045,
 'table': 4010,
 'abbr': 3665,
 'cite': 3563,
 'small': 3272,
 'dd': 1376,
 'h1': 999,
 'noscript': 999,
 'ol': 858,
 'h3': 777,
 'strong': 599,
 'dl': 457,
 'dt': 334,
 'caption': 200,
 'sub': 151,
 'h4': 117,
 'code': 108,
 'wbr': 85,
 'q': 76,
 'big': 75,
 'center': 64,
 'blockquote': 58,
 'hr': 51,
 'u': 51,
 'font': 40,
 'area': 39,
 'rp': 32,
 'ruby': 16,
 'rb': 16,
 'rt': 16,
 's': 10,
 'bdi': 4,
 'h5': 4,
 'audio': 2,
 'source': 2,
 'del': 2,
 'map': 2,
 'math': 2,
 'semantics': 2,
 'mrow': 2,
 'mstyle': 2,
 'mo': 2,
 'annotation': 2,
 'samp': 2,
 'pre': 1,
 'h6': 1}

Based on our findings, it looks like there are quite a few td, a, li, and span tags. This indicates that articles tend to have lots of links, along with lists and tables. Links are the most numerous tag, which indicates how interconnected articles on Wikipedia are.

# Find common words

In [40]:
from bs4 import BeautifulSoup
import wordCount

start = time.time()
if __name__ == '__main__':
    pool_word = concurrent.futures.ProcessPoolExecutor(max_workers = 4)
    word_list = list(pool_word.map(wordCount.word_count, parsed))

word_dict = {}

for word in word_list:
    for each in word:
        if each[0] not in word_dict:
            word_dict[each[0]] = 0
        word_dict[each[0]] += each[1]

print(time.time() - start)
sort_word = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)

words = dict(sort_word)

words

7.506750822067261


{'retrieved': 1733,
 'wikipedia': 1310,
 'united': 781,
 'production': 686,
 'university': 635,
 'articles': 594,
 'italian': 594,
 'italy': 536,
 'county': 533,
 'district': 485,
 'school': 483,
 'species': 420,
 'bellator': 367,
 'national': 358,
 'states': 351,
 'article': 336,
 'station': 335,
 'party': 326,
 'league': 317,
 'french': 298,
 'spanish': 296,
 'texas': 291,
 'women': 289,
 'church': 288,
 'august': 287,
 'state': 285,
 'march': 281,
 'campus': 280,
 'world': 278,
 'german': 278,
 'october': 278,
 'south': 275,
 'yards': 275,
 'season': 269,
 'football': 259,
 'november': 259,
 'family': 252,
 'american': 248,
 'album': 247,
 'september': 245,
 'music': 244,
 'which': 243,
 'february': 242,
 'series': 236,
 'december': 225,
 'north': 222,
 'original': 221,
 'vikings': 207,
 'avengers': 203,
 'their': 202,
 'company': 202,
 'building': 198,
 'academy': 198,
 'image': 197,
 'archived': 196,
 'saint': 191,
 'alabama': 190,
 'college': 187,
 'january': 184,
 'youtube': 183