# Web scraping - Collecting data from the internet

In [2]:
from io import StringIO
from urllib.parse import urljoin
from warnings import filterwarnings

filterwarnings("ignore")

# Get the HTML content of websites

In [3]:
url = "https://en.wikipedia.org/wiki/Ratan_Tata"
print(url)

https://en.wikipedia.org/wiki/Ratan_Tata


In [4]:
import requests

In [5]:
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on a 4xx/5xx answer instead of parsing an error page further down.
response = requests.get(url, timeout=10)
response.raise_for_status()
response

<Response [200]>

In [6]:
print(response.content)

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>Ratan Tata - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-f

# Use the Beautiful Soup library to find particular elements

In [7]:
from bs4 import BeautifulSoup

In [8]:
# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed, so the same page can parse differently per machine.
soup = BeautifulSoup(response.content, "html.parser")

In [9]:
type(soup)

bs4.BeautifulSoup

In [10]:
soup

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Ratan Tata - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature

In [11]:
type(soup)

bs4.BeautifulSoup

In [12]:
title = soup.find("title")
title

<title>Ratan Tata - Wikipedia</title>

In [13]:
title.text

'Ratan Tata - Wikipedia'

## Get H1 tag with particular class

In [14]:
h1 = soup.find("h1", class_="firstHeading")
h1

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Ratan Tata</span></h1>

In [15]:
h1.text

'Ratan Tata'

## Get subheadings

In [29]:
subheading = soup.find_all("div", class_="mw-heading")
subheading

[<div class="mw-heading mw-heading2"><h2 id="Early_life_and_education">Early life and education</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Ratan_Tata&amp;action=edit&amp;section=1" title="Edit section: Early life and education"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div>,
 <div class="mw-heading mw-heading2"><h2 id="Career">Career</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Ratan_Tata&amp;action=edit&amp;section=2" title="Edit section: Career"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div>,
 <div class="mw-heading mw-heading3"><h3 id="Early_years">Early years</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Ratan_Tata&amp;action=edit&amp;section=3" title="Edit section: Early years"><span>edit</span></a><span class="mw-editsection-bracket">

In [31]:
subheading[0].text

'Early life and education[edit]'

In [32]:
subheading_text = [tag.text for tag in subheading]
subheading_text

['Early life and education[edit]',
 'Career[edit]',
 'Early years[edit]',
 'Later years[edit]',
 'Philanthropic Contributions and Endowments[edit]',
 '1984 Anti-Sikh Pogrom victims[edit]',
 'University of New South Wales[edit]',
 'University of California[edit]',
 'Tata Education and Development Trust[edit]',
 'Executive center at Harvard Business School[edit]',
 'Tata Innovation Center at Cornell Tech[edit]',
 'Indian Institute of Technology[edit]',
 'Indian Centre for Neuroscience[edit]',
 'MIT Tata Center of Technology and Design[edit]',
 'Cornell University[edit]',
 'Contributions to Taj Hotel Staff Victims[edit]',
 'Board memberships and affiliations[edit]',
 'Personal life and death[edit]',
 'Honours and awards[edit]',
 'In popular culture[edit]',
 'See also[edit]',
 'References[edit]',
 'Bibliography[edit]',
 'External links[edit]']

## Getting the paragraphs

In [34]:
para = soup.find_all("p")
para[0:3]

[<p class="mw-empty-elt">
 </p>,
 <p><b>Ratan Naval Tata</b><sup class="reference" id="cite_ref-3"><a href="#cite_note-3"><span class="cite-bracket">[</span>a<span class="cite-bracket">]</span></a></sup> (28 December 1937 – 9 October 2024) was an  Indian industrialist and philanthropist. He served as the chairman of <a href="/wiki/Tata_Group" title="Tata Group">Tata Group</a> and <a href="/wiki/Tata_Sons" title="Tata Sons">Tata Sons</a> from 1991 to 2012 and he held the position of interim chairman from October 2016 to February 2017.<sup class="reference" id="cite_ref-4"><a href="#cite_note-4"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup><sup class="reference" id="cite_ref-5"><a href="#cite_note-5"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup> In 2000, he received the <a href="/wiki/Padma_Bhushan" title="Padma Bhushan">Padma Bhushan</a>, the third highest civilian honour in India, followed by the <a href="/wiki/Pad

In [36]:
para_text = [tag.text for tag in para]
para_text[0:2]

['\n',
 "Ratan Naval Tata[a] (28 December 1937 – 9 October 2024) was an  Indian industrialist and philanthropist. He served as the chairman of Tata Group and Tata Sons from 1991 to 2012 and he held the position of interim chairman from October 2016 to February 2017.[3][4] In 2000, he received the Padma Bhushan, the third highest civilian honour in India, followed by the Padma Vibhushan, the country's second highest civilian honour, in 2008.[5]\n"]

In [37]:
para_str = "\n\n".join(para_text)
print(para_str[0:100])




Ratan Naval Tata[a] (28 December 1937 – 9 October 2024) was an  Indian industrialist and philanth


# Save the file

In [38]:
with open("RatanTata.txt", "w", encoding="utf-8") as f:
    f.write(para_str)

## Get image links

In [42]:
a_tags = soup.find_all("a", class_="mw-file-description")
a_tags

[<a class="mw-file-description" href="/wiki/File:Ratan_Tata_2011_(The_TCS_Story_Launch_-_CII)_(cropped).jpg"><img class="mw-file-element" data-file-height="1061" data-file-width="965" decoding="async" height="275" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg/250px-Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg/500px-Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg 1.5x" width="250"/></a>,
 <a class="mw-file-description" href="/wiki/File:Shri_Ratan_Tata_meeting_the_Union_Minister_for_Commerce_%26_Industry_and_Textiles,_Shri_Anand_Sharma,_in_New_Delhi_on_December_22,_2011.jpg"><img class="mw-file-element" data-file-height="1447" data-file-width="2200" decoding="async" height="164" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/42/Shri_Ratan

In [43]:
a_tags[0]

<a class="mw-file-description" href="/wiki/File:Ratan_Tata_2011_(The_TCS_Story_Launch_-_CII)_(cropped).jpg"><img class="mw-file-element" data-file-height="1061" data-file-width="965" decoding="async" height="275" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg/250px-Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg/500px-Ratan_Tata_2011_%28The_TCS_Story_Launch_-_CII%29_%28cropped%29.jpg 1.5x" width="250"/></a>

In [44]:
a_tags[0].get("href")

'/wiki/File:Ratan_Tata_2011_(The_TCS_Story_Launch_-_CII)_(cropped).jpg'

In [45]:
home_page = "https://en.wikipedia.org"
home_page

'https://en.wikipedia.org'

In [46]:
home_page + a_tags[0].get("href")

'https://en.wikipedia.org/wiki/File:Ratan_Tata_2011_(The_TCS_Story_Launch_-_CII)_(cropped).jpg'

In [47]:
img_links = [home_page + tag.get("href") for tag in a_tags]
img_links

['https://en.wikipedia.org/wiki/File:Ratan_Tata_2011_(The_TCS_Story_Launch_-_CII)_(cropped).jpg',
 'https://en.wikipedia.org/wiki/File:Shri_Ratan_Tata_meeting_the_Union_Minister_for_Commerce_%26_Industry_and_Textiles,_Shri_Anand_Sharma,_in_New_Delhi_on_December_22,_2011.jpg',
 'https://en.wikipedia.org/wiki/File:28india1.jpg',
 'https://en.wikipedia.org/wiki/File:The_Prime_Minister,_Shri_Narendra_Modi_releasing_the_Platinum_Jubilee_Milestone_book_on_Tata_Memorial_Centre,_in_New_Delhi_on_May_25,_2017._Shri_Ratan_Tata_and_other_dignitaries_are_also_seen.jpg',
 'https://en.wikipedia.org/wiki/File:The_TCS_Story_Launch_-_CII.jpg',
 'https://en.wikipedia.org/wiki/File:The_President,_Smt._Pratibha_Devisingh_Patil_presenting_the_Padma_Vibhushan_to_Shri_Ratan_Naval_Tata_at_Civil_Investiture-II_Ceremony,_at_Rashtrapati_Bhavan,_in_New_Delhi_on_May_10,_2008.jpg',
 'https://en.wikipedia.org/wiki/File:Commons-logo.svg',
 'https://en.wikipedia.org/wiki/File:Wikiquote-logo.svg',
 'https://en.wikipedia

## Get all tables

In [48]:
%pip install lxml

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
table = soup.find_all("table", class_="wikitable")
table[0]

<table class="wikitable" style="font-size:90%;">
<tbody><tr>
<th>Year</th>
<th>Name</th>
<th>Awarding organisation</th>
<th>Ref.
</th></tr>
<tr>
<td>2001
</td>
<td>Honorary <a href="/wiki/Doctor_of_Business_Administration" title="Doctor of Business Administration">Doctor of Business Administration</a>
</td>
<td><a href="/wiki/Ohio_State_University" title="Ohio State University">Ohio State University</a>
</td>
<td><sup class="reference" id="cite_ref-93"><a href="#cite_note-93"><span class="cite-bracket">[</span>92<span class="cite-bracket">]</span></a></sup>
</td></tr>
<tr>
<td rowspan="2">2004
</td>
<td><a href="/wiki/Medal_of_the_Oriental_Republic_of_Uruguay" title="Medal of the Oriental Republic of Uruguay">Medal of the Oriental Republic of Uruguay</a>
</td>
<td><a class="mw-redirect" href="/wiki/Government_of_Uruguay" title="Government of Uruguay">Government of Uruguay</a>
</td>
<td><sup class="reference" id="cite_ref-94"><a href="#cite_note-94"><span class="cite-bracket">[</span>93

In [50]:
import pandas as pd

In [52]:
# read_html expects a path, URL or file-like object; passing the markup itself
# as a plain string is deprecated, so wrap it in StringIO.
df1 = pd.read_html(StringIO(str(table[0])))[0].T
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
Year,2001,2004,2004,2005,2005,2006,2006,2007,2007,2008,...,2014,2014,2014,2015,2015,2016,2018,2022,2023,2023
Name,Honorary Doctor of Business Administration,Medal of the Oriental Republic of Uruguay,Honorary Doctor of Technology,International Distinguished Achievement Award,Honorary Doctor of Science,Honorary Doctor of Science,Responsible Capitalism Award,Honorary Fellowship,Carnegie Medal of Philanthropy,Honorary Doctor of Law,...,Sayaji Ratna Award,Honorary Knight Grand Cross of the Order of th...,Honorary Doctor of Laws,Honorary Doctor of Automotive Engineering,Sayaji Ratna Award,Commander of the Legion of Honour,Honorary Doctor in Engineering,Honorary Doctor of Literature,Honorary Officer of the Order of Australia (AO),Maharashtra Udyog Ratna
Awarding organisation,Ohio State University,Government of Uruguay,Asian Institute of Technology.,B'nai B'rith International,University of Warwick.,Indian Institute of Technology Madras,For Inspiration and Recognition of Science and...,The London School of Economics and Political S...,Carnegie Endowment for International Peace,University of Cambridge,...,Baroda Management Association,Queen Elizabeth II,"York University, Canada",Clemson University,"Baroda Management Association, Honoris Causa, ...",Government of France,Swansea University,HSNC University,King Charles III,Government of Maharashtra
Ref.,[92],[93],[94],[95],[96],[97],[98],[99],[100],[101],...,[127],[128][129],[130],[131],[132],[133],[134][135],[136][137],[138],[139]


In [53]:
# Convert every "wikitable" on the page into a DataFrame, showing each one as
# it is parsed and keeping all of them in `dfs`.
dfs = []

for t in table:
    # Wrap the markup in StringIO: literal HTML strings are deprecated input
    # for read_html.
    r = pd.read_html(StringIO(str(t)))[0]
    display(r)
    dfs.append(r)

Unnamed: 0,Year,Name,Awarding organisation,Ref.
0,2001,Honorary Doctor of Business Administration,Ohio State University,[92]
1,2004,Medal of the Oriental Republic of Uruguay,Government of Uruguay,[93]
2,2004,Honorary Doctor of Technology,Asian Institute of Technology.,[94]
3,2005,International Distinguished Achievement Award,B'nai B'rith International,[95]
4,2005,Honorary Doctor of Science,University of Warwick.,[96]
5,2006,Honorary Doctor of Science,Indian Institute of Technology Madras,[97]
6,2006,Responsible Capitalism Award,For Inspiration and Recognition of Science and...,[98]
7,2007,Honorary Fellowship,The London School of Economics and Political S...,[99]
8,2007,Carnegie Medal of Philanthropy,Carnegie Endowment for International Peace,[100]
9,2008,Honorary Doctor of Law,University of Cambridge,[101]


Unnamed: 0,Business positions,Business positions.1,Business positions.2
0,Preceded byJ. R. D. Tata,Chairman of Tata Group 1991–2012,Succeeded byCyrus Mistry
1,Preceded byCyrus Mistry,Chairman of Tata Group 2016–2017,Succeeded byNatarajan Chandrasekaran


In [54]:
len(dfs)

2

In [57]:
dfs[1]

Unnamed: 0,Business positions,Business positions.1,Business positions.2
0,Preceded byJ. R. D. Tata,Chairman of Tata Group 1991–2012,Succeeded byCyrus Mistry
1,Preceded byCyrus Mistry,Chairman of Tata Group 2016–2017,Succeeded byNatarajan Chandrasekaran


## Creating a class to scrape any Wikipedia page

In [58]:
import requests
from bs4 import BeautifulSoup


class WikiScraper:
    """Download one Wikipedia article and extract pieces of it.

    The page is fetched and parsed once, in the constructor. Every ``get_*``
    method then reads from the parsed tree stored in ``self.soup``, so calling
    several of them does not trigger further network requests.
    """

    def __init__(self, url: str, timeout: float = 10.0):
        """Fetch ``url`` and parse the response body.

        Args:
            url: Address of the article to scrape.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If the server does not answer within ``timeout``.
        """
        self.url = url
        # Base used to turn the site-relative hrefs found in the page into
        # absolute URLs.
        self.home_page = "https://en.wikipedia.org"
        # Without a timeout a stalled connection would block forever.
        self.response = requests.get(url, timeout=timeout)
        self.response.raise_for_status()
        # Explicit parser: otherwise the result depends on which optional
        # parser libraries happen to be installed.
        self.soup = BeautifulSoup(self.response.content, "html.parser")

    def get_title(self):
        """Return the text of the page's ``<title>`` element.

        Raises:
            AttributeError: If the document has no ``<title>`` element.
        """
        title_tag = self.soup.find("title")
        return title_tag.text

    def get_h1(self):
        """Return the text of the ``<h1 class="firstHeading">`` element.

        Raises:
            AttributeError: If the document has no such element.
        """
        h1_tag = self.soup.find("h1", class_="firstHeading")
        return h1_tag.text

    def get_subheadings(self):
        """Return the section headings of the article as a list of strings.

        Each heading lives in a ``<div class="mw-heading">`` wrapper that also
        holds the "[edit]" link. Only the text of the inner ``<h1>``-``<h6>``
        element is returned, so the "[edit]" suffix is not included. If a
        wrapper has no heading element, its full text is used instead.
        """
        heading_names = ["h1", "h2", "h3", "h4", "h5", "h6"]
        sub_text = []
        for wrapper in self.soup.find_all("div", class_="mw-heading"):
            heading = wrapper.find(heading_names)
            source = heading if heading is not None else wrapper
            sub_text.append(source.text)
        return sub_text

    def get_paras(self):
        """Return the text of every ``<p>`` element, joined by blank lines."""
        p_tags = self.soup.find_all("p")
        return "\n\n".join(tag.text for tag in p_tags)

    def get_image_links(self):
        """Return absolute URLs of the file pages linked from the article.

        The links are the ``href`` values of ``<a class="mw-file-description">``
        anchors, i.e. the "File:" description pages rather than the image
        binaries. Anchors without an ``href`` are skipped. ``urljoin`` is used
        so that an href that is already absolute is returned unchanged instead
        of being glued onto ``self.home_page``.
        """
        a_tags = self.soup.find_all("a", class_="mw-file-description")
        img_links = []
        for tag in a_tags:
            href = tag.get("href")
            if href:
                img_links.append(urljoin(self.home_page, href))
        return img_links

    def get_all_tables(self):
        """Parse every ``<table class="wikitable">`` into a DataFrame.

        Each table is displayed as it is parsed (``display`` is the function
        IPython provides inside a notebook) and the DataFrames are returned
        as a list, in document order.
        """
        table_tags = self.soup.find_all("table", class_="wikitable")
        dfs = []
        for table in table_tags:
            # Literal HTML strings are deprecated input for read_html; a
            # file-like wrapper is the supported form.
            r = pd.read_html(StringIO(str(table)))[0]
            display(r)
            dfs.append(r)
        return dfs

In [59]:
url2 = "https://en.wikipedia.org/wiki/Data_science"
print(url2)

https://en.wikipedia.org/wiki/Data_science


In [67]:
s1 = WikiScraper(url=url2)
type(s1)

__main__.WikiScraper

In [68]:
s1.get_title()

'Data science - Wikipedia'

In [69]:
s1.get_h1()

'Data science'

In [70]:
s1.get_image_links()

['https://en.wikipedia.org/wiki/File:PIA23792-1600x1200(1).jpg',
 'https://en.wikipedia.org/wiki/File:EDA_example_-_Always_plot_your_data.jpg',
 'https://en.wikipedia.org/wiki/File:Data_Science.png',
 'https://en.wikipedia.org/wiki/File:Cloud_computing_in_enabling_data_science_at_scale.jpg']

In [71]:
url

'https://en.wikipedia.org/wiki/Ratan_Tata'

In [72]:
s2 = WikiScraper(url=url)
s2.get_all_tables()

Unnamed: 0,Year,Name,Awarding organisation,Ref.
0,2001,Honorary Doctor of Business Administration,Ohio State University,[92]
1,2004,Medal of the Oriental Republic of Uruguay,Government of Uruguay,[93]
2,2004,Honorary Doctor of Technology,Asian Institute of Technology.,[94]
3,2005,International Distinguished Achievement Award,B'nai B'rith International,[95]
4,2005,Honorary Doctor of Science,University of Warwick.,[96]
5,2006,Honorary Doctor of Science,Indian Institute of Technology Madras,[97]
6,2006,Responsible Capitalism Award,For Inspiration and Recognition of Science and...,[98]
7,2007,Honorary Fellowship,The London School of Economics and Political S...,[99]
8,2007,Carnegie Medal of Philanthropy,Carnegie Endowment for International Peace,[100]
9,2008,Honorary Doctor of Law,University of Cambridge,[101]


Unnamed: 0,Business positions,Business positions.1,Business positions.2
0,Preceded byJ. R. D. Tata,Chairman of Tata Group 1991–2012,Succeeded byCyrus Mistry
1,Preceded byCyrus Mistry,Chairman of Tata Group 2016–2017,Succeeded byNatarajan Chandrasekaran


[    Year                                               Name  \
 0   2001         Honorary Doctor of Business Administration   
 1   2004          Medal of the Oriental Republic of Uruguay   
 2   2004                      Honorary Doctor of Technology   
 3   2005      International Distinguished Achievement Award   
 4   2005                         Honorary Doctor of Science   
 5   2006                         Honorary Doctor of Science   
 6   2006                       Responsible Capitalism Award   
 7   2007                                Honorary Fellowship   
 8   2007                     Carnegie Medal of Philanthropy   
 9   2008                             Honorary Doctor of Law   
 10  2008                         Honorary Doctor of Science   
 11  2008                         Honorary Doctor of Science   
 12  2008                             Honorary Citizen Award   
 13  2008                                Honorary Fellowship   
 14  2008                          Inspi

In [74]:
urls = [
    "https://en.wikipedia.org/wiki/World_population",
    "https://en.wikipedia.org/wiki/Data_science",
    "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "https://en.wikipedia.org/wiki/Rust_(programming_language)",
    "https://en.wikipedia.org/wiki/Data_analysis",
]
urls

['https://en.wikipedia.org/wiki/World_population',
 'https://en.wikipedia.org/wiki/Data_science',
 'https://en.wikipedia.org/wiki/Python_(programming_language)',
 'https://en.wikipedia.org/wiki/Rust_(programming_language)',
 'https://en.wikipedia.org/wiki/Data_analysis']

In [75]:
# Map each page's H1 heading to the list of file-page links found on that page.
d = {}

for i in urls:
    # Constructing the scraper performs the HTTP request and parses the page.
    scraper = WikiScraper(url=i)
    print(f"Scraping : {scraper.url}")
    h1 = scraper.get_h1()
    img = scraper.get_image_links()
    # Keyed by heading text: two pages with the same H1 would overwrite
    # each other here.
    d[h1] = img
    print(d[h1])
    print("=" * 60 + "\n")

# Printing final dictionary
print(d)

Scraping : https://en.wikipedia.org/wiki/World_population
['https://en.wikipedia.org/wiki/File:World_Population_Prospects.svg', 'https://en.wikipedia.org/wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png', 'https://en.wikipedia.org/wiki/File:2020_1million_cities.jpg', 'https://en.wikipedia.org/wiki/File:Expectancy_of_life.svg', 'https://en.wikipedia.org/wiki/File:Population_pyramid_of_the_world_in_continental_groupings_2023.svg', 'https://en.wikipedia.org/wiki/File:Global_population_cartogram.png', 'https://en.wikipedia.org/wiki/File:People%27s_-Km%C2%B2_for_all_countries_(and_us_states,_uk_kingdoms).png', 'https://en.wikipedia.org/wiki/File:Top_5_Country_Population_Graph_1901_to_2021.svg', 'https://en.wikipedia.org/wiki/File:Population_Density,_v4.11,_2020_(48009093621).jpg', 'https://en.wikipedia.org/wiki/File:World_population_(UN).svg', 'https://en.wikipedia.org/wiki/File:Total_Fertility_Rate_Map_by_Country.svg', 'https://en.wikipedia.org/wiki/F

In [76]:
d

{'World population': ['https://en.wikipedia.org/wiki/File:World_Population_Prospects.svg',
  'https://en.wikipedia.org/wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png',
  'https://en.wikipedia.org/wiki/File:2020_1million_cities.jpg',
  'https://en.wikipedia.org/wiki/File:Expectancy_of_life.svg',
  'https://en.wikipedia.org/wiki/File:Population_pyramid_of_the_world_in_continental_groupings_2023.svg',
  'https://en.wikipedia.org/wiki/File:Global_population_cartogram.png',
  'https://en.wikipedia.org/wiki/File:People%27s_-Km%C2%B2_for_all_countries_(and_us_states,_uk_kingdoms).png',
  'https://en.wikipedia.org/wiki/File:Top_5_Country_Population_Graph_1901_to_2021.svg',
  'https://en.wikipedia.org/wiki/File:Population_Density,_v4.11,_2020_(48009093621).jpg',
  'https://en.wikipedia.org/wiki/File:World_population_(UN).svg',
  'https://en.wikipedia.org/wiki/File:Total_Fertility_Rate_Map_by_Country.svg',
  'https://en.wikipedia.org/wiki/File:World_popul

In [77]:
import json

In [78]:
# Pin the encoding so the file is written the same way on every platform,
# matching how RatanTata.txt is written earlier in the notebook.
with open("links.json", "w", encoding="utf-8") as f:
    json.dump(d, f, indent=4)