# Tutorial 5: Create a table with granular elements from a Wikipedia page

## Retrieve earlier saved soup object 
### If you don't have this, follow Tutorials 1 - 3

In [1]:
# Restore the `soup` variable that an earlier notebook persisted with IPython's %store magic.
%store -r soup

## Imports
### (In addition to BeautifulSoup, we also import the NavigableString, Comment, Stylesheet and Tag classes)

In [2]:
from bs4 import BeautifulSoup, NavigableString, Comment, Stylesheet, Tag

## Helper function to walk and annotate the content soup

In [3]:
def walk_content(content_list, table=None):
    """Record every non-blank node of a parsed tree, depth first.

    For each item whose stripped text is non-empty, one entry is appended to
    each of the ``"text"``, ``"type"``, ``"attrs"`` and ``"name"`` lists of the
    target table. Tags are then descended into, so a parent's row always
    precedes the rows of its children.

    Args:
        content_list: Iterable of nodes. Each node must expose ``.text``;
            nodes that are not NavigableString/Comment/Stylesheet instances
            must also expose ``.attrs``, ``.name`` and ``.children``.
        table: Optional dict holding the four lists to append to. When omitted,
            the module-level ``content_dict`` is used, which keeps existing
            ``walk_content(content_list)`` calls working unchanged.

    Returns:
        None. The target table is modified in place.
    """
    if table is None:
        # Looked up at call time, so the notebook may define it in a later cell.
        table = content_dict
    for each_item in content_list:
        text = each_item.text.strip()
        if not text:
            # Blank nodes are skipped, and so is everything beneath them.
            continue
        table["text"].append(text)
        table["type"].append(str(type(each_item)))
        if isinstance(each_item, (NavigableString, Comment, Stylesheet)):
            # String-like leaves have no attributes or tag name: store an empty
            # class list and use the Python type as the name.
            table["attrs"].append({"class": []})
            table["name"].append(str(type(each_item)))
        else:
            table["attrs"].append(each_item.attrs)
            table["name"].append(each_item.name)
            # `.children` is an iterator object and therefore always truthy, so
            # the former `if each_item.children:` guard never skipped anything.
            walk_content(list(each_item.children), table)

## Call walk function and create a section variable
#### Also create other variables to hold attributes, tag names and types

### Create a dict of objects to populate your table

In [4]:
# Column-oriented table: one independent, initially empty list per column.
content_dict = {column: [] for column in ("type", "text", "attrs", "name")}

# Placeholders for the page URL and title, filled in by later cells.
url = text_title = ""

### Parse soup object

#### A canonical link refers to the standard URL that the page is linked to

In [5]:
# Keep the href of the page's canonical <link>; if several match, the last one wins.
for canonical_link in soup.find_all("link", attrs={"rel": ["canonical"]}):
    url = canonical_link.attrs["href"]

    

### Extract Title
#### The title of the Wikipedia page is the h1 tag inside the header element with class mw-body-header

In [6]:
# Read the title from the whole <h1>. Looping over the tag itself walks its
# children one at a time, so a heading split across several child elements
# would otherwise leave only the last child's text in `text_title`.
title_header = soup.find("header", {"class": "mw-body-header"})
title_heading = title_header.find("h1") if title_header is not None else None
if title_heading is not None:
    text_title = title_heading.get_text()
# When the header or its h1 is missing, `text_title` keeps its previous value
# instead of the cell failing on `None`.

### Extract Content
#### The content of the page is taken from the div with id mw-content-text, found inside the div with id bodyContent

In [7]:
# Start from an empty table so that re-running this cell does not append a
# second copy of every row to the lists filled by an earlier run.
content_dict = {"type": [], "text": [], "attrs": [], "name": []}

for body in soup.find_all("div", {"id": ["bodyContent"]}):
    content = body.find("div", {"id": "mw-content-text"})
    if content is None:
        # This container has no article text to walk; skip it instead of
        # failing on `None.children`.
        continue
    walk_content(list(content.children))

# Drop the rows recorded for the code, style and abbr tags themselves in a
# single pass. Rows recorded for their children are kept, as before.
kept_rows = [
    idx
    for idx, tag_name in enumerate(content_dict["name"])
    if tag_name not in ("code", "style", "abbr")
]
for column in ("type", "text", "attrs", "name"):
    content_dict[column] = [content_dict[column][idx] for idx in kept_rows]

# Categorise all elements based on sections found in h2 headers along the article.
# Rows that appear before the first h2 are labelled "No Section".
sections = []
current_section = "No Section"
for tag_name, tag_text in zip(content_dict["name"], content_dict["text"]):
    if tag_name == "h2":
        current_section = tag_text
    sections.append(current_section)
content_dict["section"] = sections

# Append URL and title: the same value repeated on every row.
row_count = len(content_dict["name"])
content_dict["url"] = [url] * row_count
content_dict["title"] = [text_title] * row_count
    
                

### Convert to a pandas dataframe

In [8]:
import pandas as pd
# Each key of the dict becomes a column; each list position becomes a row.
content_frame = pd.DataFrame(data=content_dict)

### Quick check: scan through some records

In [9]:
# Cap each displayed cell at 400 characters, then preview the first five rows.
pd.set_option("display.max_colwidth", 400)
content_frame.head()

Unnamed: 0,type,text,attrs,name,section,url,title
0,<class 'bs4.element.Tag'>,"Swiss professional golfer\n\n\nJoel GirrbachPersonal informationBorn (1993-07-19) 19 July 1993 (age 32)Kreuzlingen, Thurgau, SwitzerlandHeight1.81 m (5 ft 11 in)Weight76 kg (168 lb)Sporting nationality SwitzerlandCareerTurned professional2015Current tourEuropean TourFormer toursChallenge TourPro Golf TourProfessional wins1Number of wins by tourChallenge Tour1\nJoel Girrbach (born 19 July 1993)...","{'class': ['mw-content-ltr', 'mw-parser-output'], 'dir': 'ltr', 'lang': 'en'}",div,No Section,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach
1,<class 'bs4.element.Tag'>,Swiss professional golfer,"{'class': ['shortdescription', 'nomobile', 'noexcerpt', 'noprint', 'searchaux'], 'style': 'display:none'}",div,No Section,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach
2,<class 'bs4.element.NavigableString'>,Swiss professional golfer,{'class': []},<class 'bs4.element.NavigableString'>,No Section,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach
3,<class 'bs4.element.Tag'>,"Joel GirrbachPersonal informationBorn (1993-07-19) 19 July 1993 (age 32)Kreuzlingen, Thurgau, SwitzerlandHeight1.81 m (5 ft 11 in)Weight76 kg (168 lb)Sporting nationality SwitzerlandCareerTurned professional2015Current tourEuropean TourFormer toursChallenge TourPro Golf TourProfessional wins1Number of wins by tourChallenge Tour1","{'class': ['infobox', 'vcard'], 'style': 'width:25em'}",table,No Section,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach
4,<class 'bs4.element.Tag'>,"Joel GirrbachPersonal informationBorn (1993-07-19) 19 July 1993 (age 32)Kreuzlingen, Thurgau, SwitzerlandHeight1.81 m (5 ft 11 in)Weight76 kg (168 lb)Sporting nationality SwitzerlandCareerTurned professional2015Current tourEuropean TourFormer toursChallenge TourPro Golf TourProfessional wins1Number of wins by tourChallenge Tour1",{},tbody,No Section,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach


## Create a table counting the paragraphs in each section

In [10]:
# Count the rows whose tag name is "p", grouped by the section they fall under.
content_frame.loc[content_frame["name"] == "p", "section"].value_counts()

section
Professional career              3
Team appearances                 2
No Section                       1
Early life and amateur career    1
Amateur wins                     1
Name: count, dtype: int64

### List out content relevant to a section

In [13]:
# Lift the column-width cap so the full paragraph text is displayed.
pd.set_option("display.max_colwidth", None)

# Select the paragraph rows that belong to the "Professional career" section.
in_professional_career = content_frame["section"] == "Professional career"
is_paragraph = content_frame["name"] == "p"
content_frame[in_professional_career & is_paragraph]

Unnamed: 0,type,text,attrs,name,section,url,title
144,<class 'bs4.element.Tag'>,"Girrbach turned professional in 2015 and joined the Challenge Tour. In 2016, he was runner-up at the Red Sea Egyptian Challenge and in 2017 he won his first title, the Swiss Challenge at Golf Sempach by two strokes.[2]",{},p,Professional career,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach
162,<class 'bs4.element.Tag'>,"In 2018, he was runner-up at the Prague Golf Challenge and the Hopps Open de Provence.[3]",{},p,Professional career,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach
177,<class 'bs4.element.Tag'>,"In 2023, Girrbach was runner-up at The Challenge in India and the Hainan Open in China, and graduated to the European Tour for 2024. In his rookie season, he recorded several top-10 finishes, including a T-8 at the Bahrain Championship and a T-3 at the Volvo China Open.[4] On the back of these results, he qualified for the 2024 Summer Olympics in Paris.[5]",{},p,Professional career,https://en.wikipedia.org/wiki/Joel_Girrbach,Joel Girrbach


## Create cross tab of tags with sections

In [14]:
# Rows are sections, columns are tag names, cells are occurrence counts.
pd.crosstab(index=content_frame["section"], columns=content_frame["name"])

name,<class 'bs4.element.NavigableString'>,a,b,cite,div,h2,h3,i,li,ol,p,span,sup,table,tbody,td,th,tr,ul
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Amateur wins,11,2,0,0,1,1,0,0,3,0,1,6,1,0,0,0,0,0,1
Early life and amateur career,8,2,0,0,1,1,0,0,0,0,1,6,1,0,0,0,0,0,0
External links,16,7,0,0,1,1,0,0,3,0,0,6,0,0,0,0,0,0,1
No Section,45,11,9,0,4,0,0,0,0,0,1,6,1,1,1,9,13,13,0
Professional career,42,16,0,0,1,1,0,0,0,0,3,12,4,0,0,0,0,0,0
Professional wins (1),22,5,0,0,2,1,1,0,0,0,0,8,0,1,1,6,6,2,0
References,54,15,8,6,3,1,0,6,6,1,0,28,4,0,0,0,0,0,0
See also,5,2,0,0,1,1,0,0,1,0,0,4,0,0,0,0,0,0,1
Team appearances,11,3,1,0,1,1,0,0,1,0,2,6,1,0,0,0,0,0,1
