# Tutorial 5: Create a table of granular elements from a Wikipedia page

## Retrieve the previously saved soup object
### If you don't have it yet, work through Tutorials 1–3 first

In [17]:
# Restore the `soup` variable that was saved earlier with %store.
%store -r soup

## Imports
### (In addition to BeautifulSoup, we also import the NavigableString, Comment, Stylesheet and Tag classes)

In [18]:
from bs4 import BeautifulSoup, NavigableString, Comment, Stylesheet, Tag

## Helper function to walk and annotate the content soup

In [19]:
def walk_content(content_list):
    """Record every item of a parsed HTML subtree that has visible text.

    The items are visited depth first, in document order. For each item whose
    stripped text is non-empty, one entry is appended to each of the "text",
    "type", "attrs" and "name" lists of the module-level ``content_dict``,
    which must already exist when this function is called.

    String-like nodes (NavigableString, Comment, Stylesheet) get the
    placeholder attrs ``{"class": []}`` and use their type string as the name.
    Every other item contributes its own ``attrs`` and ``name``, and its
    children are then walked the same way.

    Parameters
    ----------
    content_list : iterable
        Parsed nodes to visit, for example ``list(tag.children)``.

    Returns
    -------
    None
        All results are written to ``content_dict`` as a side effect.
    """
    for each_item in content_list:
        text = each_item.text.strip()
        if not text:
            # Items without visible text are skipped, and so are their children.
            continue

        type_label = str(type(each_item))
        content_dict["text"].append(text)
        content_dict["type"].append(type_label)

        if isinstance(each_item, (NavigableString, Comment, Stylesheet)):
            # String nodes have no attributes or tag name of their own.
            content_dict["attrs"].append({"class": []})
            content_dict["name"].append(type_label)
            continue

        content_dict["attrs"].append(each_item.attrs)
        content_dict["name"].append(each_item.name)
        # No emptiness check is needed here: `.children` is an iterator (always
        # truthy), and an element without children simply yields an empty list.
        walk_content(list(each_item.children))

## Call walk function and create a section variable
#### Additionally, also create other variables to hold attributes and tag names and types

### Create a dict of objects to populate your table

In [20]:
# Column store for the table: one independent list per field, filled in by walk_content.
content_dict = {column: [] for column in ("type", "text", "attrs", "name")}

# Page-level metadata; both stay empty until the soup is parsed.
url = text_title = ""

### Parse soup object

#### A canonical link refers to the standard URL that the page is linked to

In [21]:
# Read the href of every <link rel="canonical"> element; if several match, the last one wins.
canonical_filter = {"rel": ["canonical"]}
for link in soup.find_all("link", canonical_filter):
    url = link["href"]

    

### Extract Title
#### The title of the Wikipedia page is the h1 tag inside the header element with class mw-body-header

In [22]:
# Read the heading as a whole rather than child by child, so a title that is
# split across several tags is kept complete.
# A missing header or <h1> leaves text_title at its current value instead of raising.
header = soup.find("header",{"class":"mw-body-header"})
h1 = header.find("h1") if header is not None else None
if h1 is not None:
    text_title = h1.get_text().strip()

### Extract Content
#### The content of the page is taken from the div with id mw-content-text inside the div with id bodyContent

In [23]:
# Start from empty columns so that re-running this cell does not append the
# same page a second time.
content_dict = {"type":[],"text":[],"attrs":[],"name":[]}

# Walk the article container of every matching body div.
for body in soup.find_all("div", {"id":["bodyContent"]}):
    content = body.find("div",{"id":"mw-content-text"})
    if content is None:
        # This div has no article container, so there is nothing to walk.
        continue
    content_list = list(content.children)
    walk_content(content_list)

# Drop the rows that belong to code, style and abbr tags.
# NOTE(review): only the row of the tag itself is dropped; rows that
# walk_content appended for its descendants are kept - confirm this is intended.
excluded_tags = ("code", "style", "abbr")
keep_row = [tag_name not in excluded_tags for tag_name in content_dict["name"]]
for column in ("type", "text", "attrs", "name"):
    content_dict[column] = [
        value for value, keep in zip(content_dict[column], keep_row) if keep
    ]

# Categorise all elements based on sections found in h2 headers along the article.
# Rows that come before the first h2 are labelled "No Section".
current_section = "No Section"
sections = []
for tag_name, tag_text in zip(content_dict["name"], content_dict["text"]):
    if tag_name == "h2":
        current_section = tag_text
    sections.append(current_section)
content_dict["section"] = sections

# Append URL and title (the same value on every row).
row_count = len(content_dict["name"])
content_dict["url"] = [url] * row_count
content_dict["title"] = [text_title] * row_count
    
                

### Convert to a pandas dataframe

In [24]:
import pandas as pd

# Each key of content_dict becomes a column; each list position becomes a row.
content_frame = pd.DataFrame.from_dict(content_dict)

### Quick check: scan through some records

In [25]:
# Allow up to 400 characters per cell, then preview the first five rows.
pd.set_option("display.max_colwidth", 400)
content_frame.head()

Unnamed: 0,type,text,attrs,name,section,url,title
0,<class 'bs4.element.Tag'>,"Serbian writer, historian, and priest\nPavle StamatovićПавле СтаматовићBorn(1805-04-11)11 April 1805Jakovo, Military FrontierDied14 September 1864(1864-09-14) (aged 59)Novi Sad, Austrian EmpireAlma materRoyal University of Pest\nPavle Stamatović (11 April 1805 – 14 September 1864) was a Serbian writer, historian, and archpriest. He chaired the delegation of South Slavs at the Prague Slavic Con...","{'class': ['mw-content-ltr', 'mw-parser-output'], 'dir': 'ltr', 'lang': 'en'}",div,No Section,https://en.wikipedia.org/wiki/Pavle_Stamatovi%C4%87,Pavle Stamatović
1,<class 'bs4.element.Tag'>,"Serbian writer, historian, and priest","{'class': ['shortdescription', 'nomobile', 'noexcerpt', 'noprint', 'searchaux'], 'style': 'display:none'}",div,No Section,https://en.wikipedia.org/wiki/Pavle_Stamatovi%C4%87,Pavle Stamatović
2,<class 'bs4.element.NavigableString'>,"Serbian writer, historian, and priest",{'class': []},<class 'bs4.element.NavigableString'>,No Section,https://en.wikipedia.org/wiki/Pavle_Stamatovi%C4%87,Pavle Stamatović
3,<class 'bs4.element.Tag'>,"Pavle StamatovićПавле СтаматовићBorn(1805-04-11)11 April 1805Jakovo, Military FrontierDied14 September 1864(1864-09-14) (aged 59)Novi Sad, Austrian EmpireAlma materRoyal University of Pest","{'class': ['infobox', 'biography', 'vcard']}",table,No Section,https://en.wikipedia.org/wiki/Pavle_Stamatovi%C4%87,Pavle Stamatović
4,<class 'bs4.element.Tag'>,"Pavle StamatovićПавле СтаматовићBorn(1805-04-11)11 April 1805Jakovo, Military FrontierDied14 September 1864(1864-09-14) (aged 59)Novi Sad, Austrian EmpireAlma materRoyal University of Pest",{},tbody,No Section,https://en.wikipedia.org/wiki/Pavle_Stamatovi%C4%87,Pavle Stamatović


## Create a table of paragraph counts for each section

In [26]:
# Keep only the rows for <p> tags and count them per section.
is_paragraph = content_frame["name"] == "p"
content_frame.loc[is_paragraph, "section"].value_counts()

section
Biography     2
No Section    1
Name: count, dtype: int64

### List out content relevant to a section

In [27]:
# Show full cell contents without truncation.
pd.set_option("display.max_colwidth", None)

# Paragraph rows of one section. The result is empty when the page has no
# section with exactly this title.
in_section = content_frame["section"] == "History"
is_paragraph = content_frame["name"] == "p"
content_frame[in_section & is_paragraph]

Unnamed: 0,type,text,attrs,name,section,url,title


## Create cross tab of tags with sections

In [28]:
# Count how often each tag name occurs in each section (rows: sections, columns: tag names).
pd.crosstab(index=content_frame["section"], columns=content_frame["name"])

name,<class 'bs4.element.NavigableString'>,a,b,bdi,cite,div,h2,i,li,ol,p,small,span,sup,table,tbody,td,th,tr,ul
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Biography,72,19,0,0,0,1,1,11,0,0,2,0,15,6,0,0,0,0,0,0
No Section,32,9,1,0,0,6,0,0,0,0,1,1,5,1,1,1,3,4,4,0
References,53,22,6,2,6,7,1,6,9,1,0,0,23,0,1,1,2,3,3,2
