# ü•£ Beautiful Soup Tutorial ‚Äî Scraping Scikit‚ÄëLearn 

This notebook demonstrates how to scrape the Scikit-Learn Supervised Learning documentation page:
üîó https://scikit-learn.org/stable/supervised_learning.html

All extracted links are automatically converted to **full absolute URLs**.


## 1. Install Required Libraries

In [None]:
# !pip install requests beautifulsoup4 pandas

## 2. Import Libraries

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

## 3. Download the Webpage

In [2]:
base_url = 'https://scikit-learn.org/stable/supervised_learning.html'
response = requests.get(base_url)
response.status_code

200

In [4]:
print(response.text)


<!DOCTYPE html>


<html lang="en" data-content_root="./" >

  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta property="og:title" content="1. Supervised learning" />
<meta property="og:type" content="website" />
<meta property="og:url" content="https://scikit-learn/stable/supervised_learning.html" />
<meta property="og:site_name" content="scikit-learn" />
<meta property="og:description" content="Linear Models- Ordinary Least Squares, Ridge regression and classification, Lasso, Multi-task Lasso, Elastic-Net, Multi-task Elastic-Net, Least Angle Regression, LARS Lasso, Orthogonal Matching Pur..." />
<meta property="og:image" content="https://scikit-learn.org/stable/_static/scikit-learn-logo-small.png" />
<meta property="og:image:alt" content="scikit-learn" />
<meta name="description" content="Linear Models- Ordinary Least Squares, Ridge regression a

## 4. Parse the HTML

In [5]:
soup = BeautifulSoup(response.text, 'lxml')
soup.prettify()[:800]

'<!DOCTYPE html>\n<html data-content_root="./" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>\n  <meta content="width=device-width, initial-scale=1" name="viewport"/>\n  <meta content="1. Supervised learning" property="og:title"/>\n  <meta content="website" property="og:type"/>\n  <meta content="https://scikit-learn/stable/supervised_learning.html" property="og:url"/>\n  <meta content="scikit-learn" property="og:site_name"/>\n  <meta content="Linear Models- Ordinary Least Squares, Ridge regression and classification, Lasso, Multi-task Lasso, Elastic-Net, Multi-task Elastic-Net, Least Angle Regression, LARS Lasso, Orthogonal Matching Pur..." property="og:description"/>\n  <meta content="https://scikit-learn.org/stable/_static/sc'

In [7]:
soup.find_all(['h1'])[0].text

'1. Supervised learning#'

## 5. Extract Headings (h1, h2, h3)

In [11]:
# headings = [(tag.name, tag.text.strip()) for tag in soup.find_all(['h1','h2','h3'])]
# df_headings = pd.DataFrame(headings, columns=['Heading Level','Title'])
# df_headings.head()

# headings = []
# for tag in soup.find_all(['h1','h2','h3']):
#     level = tag.name
#     title = tag.text.strip()
#     headings.append((level, title))
# print(headings)

headings = []
for tag in soup.find_all(['h1','h2','h3']):
    level = tag.name
    title = tag.text.strip()
    headings.append({'Heading Level': level, 'Title': title})

df_headings = pd.DataFrame(headings)
df_headings.head()


Unnamed: 0,Heading Level,Title
0,h1,1. Supervised learning#
1,h3,This Page


## 6. Extract Algorithm Links (Fixed URLs)

In [14]:
algo_links = []
for link in soup.find_all('a'):
    href = link.get('href')
    text = link.text.strip()
    if href and ('#classification' in href or '#regression' in href):
        full_href = urljoin(base_url, href)
        algo_links.append({'Name': text, 'Link': full_href})

df_algos = pd.DataFrame(algo_links)
df_algos.head()


Unnamed: 0,Name,Link
0,1.4.1. Classification,https://scikit-learn.org/stable/modules/svm.ht...
1,1.4.2. Regression,https://scikit-learn.org/stable/modules/svm.ht...
2,1.5.1. Classification,https://scikit-learn.org/stable/modules/sgd.ht...
3,1.5.2. Regression,https://scikit-learn.org/stable/modules/sgd.ht...
4,1.10.1. Classification,https://scikit-learn.org/stable/modules/tree.h...


## 7. Extract Documentation Links (Absolute URLs)

In [16]:
doc_links = []
for link in soup.find_all('a'):
    href = link.get('href')
    text = link.text.strip()
    if href:
        full_href = urljoin(base_url, href)
        if full_href.startswith('https://scikit-learn.org'):
            doc_links.append({'Text': text, 'URL': full_href})

df_docs = pd.DataFrame(doc_links)
df_docs.head()

Unnamed: 0,Text,URL
0,Skip to main content,https://scikit-learn.org/stable/supervised_lea...
1,,https://scikit-learn.org/stable/index.html
2,Install,https://scikit-learn.org/stable/install.html
3,User Guide,https://scikit-learn.org/stable/user_guide.html
4,API,https://scikit-learn.org/stable/api/index.html


## 8. Extract Sidebar Navigation (Absolute URLs)

In [23]:
# selectÊñπÊ≥ïÂèØ‰ª•ÈÄöËøácssÈÄâÊã©Âô®Êù•Êü•ÊâæÂÖ∑‰ΩìÁöÑÂÖÉÁ¥†
# sidebar = soup.select('.toctree-l1 a')
# sidebar_items = []
# for item in sidebar:
#     text = item.text.strip()
#     href = item.get('href')
#     full_url = urljoin(base_url, href)
#     sidebar_items.append({'Section': text, 'URL': full_url})
# df_sidebar = pd.DataFrame(sidebar_items)
# df_sidebar.head()

sidebar = soup.select('.sidebar-primary-item a') # ‰ΩøÁî®CSSÈÄâÊã©Âô®ÈÄâÊã©‰æßËæπÊ†èÈìæÊé•,ÂåÖÊã¨ÊâÄÊúâÂ±ÇÁ∫ß,ÊâæÂá∫ÊâÄÊúâliÊ†áÁ≠æ‰∏ãÁöÑaÊ†áÁ≠æ
sidebar_items = []
for item in sidebar:
    href = item.get('href')
    text = item.text.strip()
    if href:
        full_href = urljoin(base_url, href)
        sidebar_items.append({'Text':text, 'URL':full_href})

df_sidebar = pd.DataFrame(sidebar_items)
df_sidebar.head()

Unnamed: 0,Text,URL
0,1. Supervised learning,https://scikit-learn.org/stable/supervised_lea...
1,1.1. Linear Models,https://scikit-learn.org/stable/modules/linear...
2,1.2. Linear and Quadratic Discriminant Analysis,https://scikit-learn.org/stable/modules/lda_qd...
3,1.3. Kernel ridge regression,https://scikit-learn.org/stable/modules/kernel...
4,1.4. Support Vector Machines,https://scikit-learn.org/stable/modules/svm.html


## 9. Save Output Files

In [24]:
# df_headings.to_csv('sklearn_headings.csv', index=False)
# df_algos.to_csv('sklearn_algorithms.csv', index=False)
# df_sidebar.to_csv('sklearn_sidebar.csv', index=False)
# 'Saved: sklearn_headings.csv, sklearn_algorithms.csv, sklearn_sidebar.csv'

df_headings.to_csv('./sklearn_headings.csv', index=False)
df_algos.to_csv('./sklearn_algorithms.csv', index=False)
df_sidebar.to_csv('./sklearn_sidebar.csv', index=False)
print("Saved: sklearn_headings.csv, sklearn_algorithms.csv, sklearn_sidebar.csv")


Saved: sklearn_headings.csv, sklearn_algorithms.csv, sklearn_sidebar.csv
