# Tutorial 4: Extract Text Section-wise 

## Retrieve earlier saved soup object 
### If you don't have a stored `soup` object yet, work through Tutorials 1 - 3 first

In [29]:
# Restore the previously stored `soup` variable into this kernel (IPython %store magic, -r = restore).
%store -r soup

## Imports
### (In addition to BeautifulSoup, we also import the NavigableString, Tag, Comment and Stylesheet classes)

In [30]:
from bs4 import BeautifulSoup, NavigableString, Comment, Stylesheet, Tag

## Pattern-based segmenter for all h2 sections

In [36]:
import re

import pandas as pd

# Column-oriented record of the segments: header label, [start, end) character
# offsets into the container's text, and the sliced text itself.
section_offsets_dict = {"section": [], "start": [], "end": [], "text_content": []}

# "mw-content-text" is the container segmented here; the wider "bodyContent"
# div can be substituted in the lookup below if preferred.
for body_text in soup.find_all("div", {"id": ["mw-content-text"]}):
    # Headers below are extracted with the same get_text(strip=True) call so
    # that they can be located verbatim inside this string.
    body_text_txt = body_text.get_text(strip=True)
    # Optional clean-up: body_text_txt = re.sub(r'\s+', ' ', body_text_txt.strip())
    len_text = len(body_text_txt)
    sections = body_text.find_all("h2")
    # The reported count includes the unlabelled leading segment.
    nbr_sections = len(sections) + 1
    print(f"{nbr_sections} section headers found")

    # Text above the first header always forms an unlabelled leading segment.
    labels = ["No Section"]
    starts = [0]
    search_from = 0
    for sect in sections:
        # Header text with runs of whitespace collapsed to a single space.
        sect_txt = re.sub(r'\s+', ' ', sect.get_text(strip=True).strip())
        if not sect_txt:
            # An empty header would "match" at offset 0; ignore it.
            continue
        # Search only past the previous header: the same words may occur
        # earlier in the text (infobox, lead paragraph, repeated sub-heading)
        # and a search from offset 0 would return that earlier position.
        found_at = body_text_txt.find(sect_txt, search_from)
        if found_at == -1:
            # Skip headers that cannot be located instead of recording a
            # placeholder offset of 0 that would corrupt neighbouring segments.
            continue
        labels.append(sect_txt)
        starts.append(found_at)
        search_from = found_at + len(sect_txt)

    # Each segment ends where the next one starts; the last runs to the end.
    ends = starts[1:] + [len_text]

    # Extend all four columns together so they always have equal length,
    # even if more than one container matched.
    section_offsets_dict["section"].extend(labels)
    section_offsets_dict["start"].extend(starts)
    section_offsets_dict["end"].extend(ends)
    section_offsets_dict["text_content"].extend(
        body_text_txt[start_pos:end_pos].strip()
        for start_pos, end_pos in zip(starts, ends)
    )

# Optional - add to a pandas dataframe
section_frame = pd.DataFrame(section_offsets_dict)

9 section headers found


## Check out segmented output

In [32]:
# Lift the column-width limit globally so text_content is not truncated
# in this and later DataFrame displays.
pd.options.display.max_colwidth = None
# Preview the first ten segments.
section_frame.head(10)

Unnamed: 0,section,start,end,text_content
0,No Section,0,497,"Swiss professional golferJoel GirrbachPersonal informationBorn(1993-07-19)19 July 1993(age 32)Kreuzlingen,Thurgau, SwitzerlandHeight1.81 m (5 ft 11 in)Weight76 kg (168 lb)Sporting nationalitySwitzerlandCareerTurned professional2015Current tourEuropean TourFormer toursChallenge TourPro Golf TourProfessional wins1Number of wins by tourChallenge Tour1Joel Girrbach(born 19 July 1993) is a Swissprofessional golferwho plays on theEuropean Tour. He won the 2017Swiss Challengeon theChallenge Tour.[1]"
1,Early life and amateur career,497,735,"Early life and amateur career[edit]Girrbach started playing golf at the age of eight. He was playing off scratch by the time he was 16, and won the 2011 Swiss Junior Championship. He was educated at Berufsbildungszentrum Weinfeldenand.[1]"
2,Professional career,735,1405,"Professional career[edit]Girrbach turned professional in 2015 and joined theChallenge Tour. In 2016, he was runner-up at theRed Sea Egyptian Challengeand in 2017 he won his first title, theSwiss Challengeat Golf Sempach by two strokes.[2]In 2018, he was runner-up at thePrague Golf Challengeand theHopps Open de Provence.[3]In 2023, Girrbach was runner-up atThe Challengein India and theHainan Openin China, andgraduatedto the European Tour for 2024. In his rookie season, he recorded several top-10 finishes, including a T-8 at theBahrain Championshipand a T-3 at theVolvo China Open.[4]On the back of these results, he qualified for the2024 Summer Olympicsin Paris.[5]"
3,Amateur wins,1405,1527,"Amateur wins[edit]2011 Swiss Junior Championship2013 Ticino Championship, Finnish Amateur2014 Leman ChampionshipSource:[6]"
4,Professional wins (1),1527,1702,Professional wins (1)[edit]Challenge Tour wins (1)[edit]No.DateTournamentWinning scoreMargin ofvictoryRunner-up14 Jun2017Swiss Challenge−17 (68-67-64-68=267)2 strokesCraig Lee
5,Team appearances,1702,1808,Team appearances[edit]AmateurDuke of York Young Champions Trophy(representing Switzerland): 2011Source:[6]
6,See also,1808,1851,See also[edit]2023 Challenge Tour graduates
7,References,1851,2498,"References[edit]^ab""Joel Girrbach Bio"". PGA European Tour. Retrieved27 June2024.^""Girrbach going in search of Swiss Challenge double"". PGA European Tour. 19 September 2023. Retrieved27 June2024.^""Joel Girrbach"". Official World Golf Ranking. Retrieved27 June2024.^Ballengee, Ryan (5 May 2024).""2024 Volvo China Open final results: Prize money payout, DP World Tour leaderboard, how much each golfer won"".Golf News Net. Retrieved27 June2024.^Ferguson, Doug (25 June 2024).""Dutch Olympic officials will deny golfers from men's competition in Paris"".ABC News. Retrieved27 June2024.^ab""Joel Girrbach"". World Amateur Golf Rankings. Retrieved27 June2024."
8,External links,2498,2728,"External links[edit]Official websiteJoel Girrbachat theEuropean Tourofficial siteJoel Girrbachat theOfficial World Golf Rankingofficial siteRetrieved from ""https://en.wikipedia.org/w/index.php?title=Joel_Girrbach&oldid=1252620522"""


## Pattern-based segmenter for all section-like headers (h1 / h2 / h3)

In [37]:
import re

import pandas as pd

# Column-oriented record of the segments: header label, [start, end) character
# offsets into the container's text, and the sliced text itself.
section_offsets_dict = {"section": [], "start": [], "end": [], "text_content": []}

# "mw-content-text" is the container segmented here; the wider "bodyContent"
# div can be substituted in the lookup below if preferred.
for body_text in soup.find_all("div", {"id": ["mw-content-text"]}):
    # Headers below are extracted with the same get_text(strip=True) call so
    # that they can be located verbatim inside this string.
    body_text_txt = body_text.get_text(strip=True)
    # Optional clean-up: body_text_txt = re.sub(r'\s+', ' ', body_text_txt.strip())
    len_text = len(body_text_txt)
    # Headers of all three levels, returned in document order.
    sections = body_text.find_all(["h1", "h2", "h3"])
    # The reported count includes the unlabelled leading segment.
    nbr_sections = len(sections) + 1
    print(f"{nbr_sections} section headers found")

    # Text above the first header always forms an unlabelled leading segment.
    labels = ["No Section"]
    starts = [0]
    search_from = 0
    for sect in sections:
        # Header text with runs of whitespace collapsed to a single space.
        sect_txt = re.sub(r'\s+', ' ', sect.get_text(strip=True).strip())
        if not sect_txt:
            # An empty header would "match" at offset 0; ignore it.
            continue
        # Search only past the previous header: sub-headings often repeat
        # words that occur earlier in the text, and a search from offset 0
        # would return that earlier position.
        found_at = body_text_txt.find(sect_txt, search_from)
        if found_at == -1:
            # Skip headers that cannot be located instead of recording a
            # placeholder offset of 0 that would corrupt neighbouring segments.
            continue
        labels.append(sect_txt)
        starts.append(found_at)
        search_from = found_at + len(sect_txt)

    # Each segment ends where the next one starts; the last runs to the end.
    ends = starts[1:] + [len_text]

    # Extend all four columns together so they always have equal length,
    # even if more than one container matched.
    section_offsets_dict["section"].extend(labels)
    section_offsets_dict["start"].extend(starts)
    section_offsets_dict["end"].extend(ends)
    section_offsets_dict["text_content"].extend(
        body_text_txt[start_pos:end_pos].strip()
        for start_pos, end_pos in zip(starts, ends)
    )

# Optional - add to a pandas dataframe
section_frame_all_hs = pd.DataFrame(section_offsets_dict)


10 section headers found


### Observe the new segments (the h3 subsection "Challenge Tour wins (1)" is now split out of "Professional wins (1)")

In [38]:
# Display the segments produced from the h1/h2/h3 headers.
section_frame_all_hs

Unnamed: 0,section,start,end,text_content
0,No Section,0,497,"Swiss professional golferJoel GirrbachPersonal informationBorn(1993-07-19)19 July 1993(age 32)Kreuzlingen,Thurgau, SwitzerlandHeight1.81 m (5 ft 11 in)Weight76 kg (168 lb)Sporting nationalitySwitzerlandCareerTurned professional2015Current tourEuropean TourFormer toursChallenge TourPro Golf TourProfessional wins1Number of wins by tourChallenge Tour1Joel Girrbach(born 19 July 1993) is a Swissprofessional golferwho plays on theEuropean Tour. He won the 2017Swiss Challengeon theChallenge Tour.[1]"
1,Early life and amateur career,497,735,"Early life and amateur career[edit]Girrbach started playing golf at the age of eight. He was playing off scratch by the time he was 16, and won the 2011 Swiss Junior Championship. He was educated at Berufsbildungszentrum Weinfeldenand.[1]"
2,Professional career,735,1405,"Professional career[edit]Girrbach turned professional in 2015 and joined theChallenge Tour. In 2016, he was runner-up at theRed Sea Egyptian Challengeand in 2017 he won his first title, theSwiss Challengeat Golf Sempach by two strokes.[2]In 2018, he was runner-up at thePrague Golf Challengeand theHopps Open de Provence.[3]In 2023, Girrbach was runner-up atThe Challengein India and theHainan Openin China, andgraduatedto the European Tour for 2024. In his rookie season, he recorded several top-10 finishes, including a T-8 at theBahrain Championshipand a T-3 at theVolvo China Open.[4]On the back of these results, he qualified for the2024 Summer Olympicsin Paris.[5]"
3,Amateur wins,1405,1527,"Amateur wins[edit]2011 Swiss Junior Championship2013 Ticino Championship, Finnish Amateur2014 Leman ChampionshipSource:[6]"
4,Professional wins (1),1527,1554,Professional wins (1)[edit]
5,Challenge Tour wins (1),1554,1702,Challenge Tour wins (1)[edit]No.DateTournamentWinning scoreMargin ofvictoryRunner-up14 Jun2017Swiss Challenge−17 (68-67-64-68=267)2 strokesCraig Lee
6,Team appearances,1702,1808,Team appearances[edit]AmateurDuke of York Young Champions Trophy(representing Switzerland): 2011Source:[6]
7,See also,1808,1851,See also[edit]2023 Challenge Tour graduates
8,References,1851,2498,"References[edit]^ab""Joel Girrbach Bio"". PGA European Tour. Retrieved27 June2024.^""Girrbach going in search of Swiss Challenge double"". PGA European Tour. 19 September 2023. Retrieved27 June2024.^""Joel Girrbach"". Official World Golf Ranking. Retrieved27 June2024.^Ballengee, Ryan (5 May 2024).""2024 Volvo China Open final results: Prize money payout, DP World Tour leaderboard, how much each golfer won"".Golf News Net. Retrieved27 June2024.^Ferguson, Doug (25 June 2024).""Dutch Olympic officials will deny golfers from men's competition in Paris"".ABC News. Retrieved27 June2024.^ab""Joel Girrbach"". World Amateur Golf Rankings. Retrieved27 June2024."
9,External links,2498,2728,"External links[edit]Official websiteJoel Girrbachat theEuropean Tourofficial siteJoel Girrbachat theOfficial World Golf Rankingofficial siteRetrieved from ""https://en.wikipedia.org/w/index.php?title=Joel_Girrbach&oldid=1252620522"""


In [35]:
# Inspect the raw dict of lists (section / start / end / text_content) behind the DataFrame.
section_offsets_dict

{'section': ['No Section',
  'Early life and amateur career',
  'Professional career',
  'Amateur wins',
  'Professional wins (1)',
  'Challenge Tour wins (1)',
  'Team appearances',
  'See also',
  'References',
  'External links'],
 'start': [0, 497, 735, 1405, 1527, 1554, 1702, 1808, 1851, 2498],
 'end': [497, 735, 1405, 1527, 1554, 1702, 1808, 1851, 2498, 2728],
 'text_content': ['Swiss professional golferJoel GirrbachPersonal informationBorn(1993-07-19)19 July 1993(age\xa032)Kreuzlingen,Thurgau, SwitzerlandHeight1.81\xa0m (5\xa0ft 11\xa0in)Weight76\xa0kg (168\xa0lb)Sporting nationalitySwitzerlandCareerTurned professional2015Current tourEuropean TourFormer toursChallenge TourPro Golf TourProfessional wins1Number of wins by tourChallenge Tour1Joel Girrbach(born 19 July 1993) is a Swissprofessional golferwho plays on theEuropean Tour. He won the 2017Swiss Challengeon theChallenge Tour.[1]',
  'Early life and amateur career[edit]Girrbach started playing golf at the age of eight. He wa