In [1]:
import requests
import csv
from bs4 import BeautifulSoup

In [5]:
# Function to fetch the raw HTML content of a URL:

def fetch_html_content(url, timeout=10):
    """Download the raw body of ``url`` with an HTTP GET.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        requests never times out unless told to, so without this a stalled
        server would block the notebook indefinitely. Defaults to 10.

    Returns
    -------
    bytes or None
        The undecoded response body, or ``None`` (after printing the error)
        when the request fails, times out, or the server answers with an
        HTTP error status.
    """
    try:
        res = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the error path.
        res.raise_for_status()
        return res.content
    except requests.exceptions.RequestException as e:
        print("ERROR" , e)
        return None

In [6]:
# Example URL used to check that fetch_html_content works:

url = "https://www.wikipedia.org/"

In [7]:
# Download the example page; html_content is None if the request failed.
html_content = fetch_html_content(url)
print(html_content)

b'<!DOCTYPE html>\n<html lang="en" class="no-js">\n<head>\n<meta charset="utf-8">\n<title>Wikipedia</title>\n<meta name="description" content="Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.">\n<script>\ndocument.documentElement.className = document.documentElement.className.replace( /(^|\\s)no-js(\\s|$)/, "$1js-enabled$2" );\n</script>\n<meta name="viewport" content="initial-scale=1,user-scalable=yes">\n<link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png">\n<link rel="shortcut icon" href="/static/favicon/wikipedia.ico">\n<link rel="license" href="//creativecommons.org/licenses/by-sa/4.0/">\n<style>\n.sprite{background-image:linear-gradient(transparent,transparent),url(portal/wikipedia.org/assets/img/sprite-de847d1a.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle}.svg-Commons-logo_sister{background-position:0 0;width:47px;height:47px}.svg-MediaWiki-logo_sister

In [8]:
# function to parse html content using beautifulsoup:

def parse_html(html_content):
    """Build a BeautifulSoup tree from markup using the ``html.parser`` backend.

    Parameters
    ----------
    html_content : bytes or str
        Markup to parse.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or ``None`` (after printing the error) if
        BeautifulSoup raised while parsing.
    """
    try:
        document = BeautifulSoup(html_content, "html.parser")
    except Exception as parse_error:
        print("ERROR" , parse_error)
        return None
    else:
        return document

In [10]:
# Example: parse the downloaded page and print its indented markup.
parsed_html = parse_html(html_content)
print(parsed_html.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Wikipedia
  </title>
  <meta content="Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation." name="description"/>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)no-js(\s|$)/, "$1js-enabled$2" );
  </script>
  <meta content="initial-scale=1,user-scalable=yes" name="viewport"/>
  <link href="/static/apple-touch/wikipedia.png" rel="apple-touch-icon"/>
  <link href="/static/favicon/wikipedia.ico" rel="shortcut icon"/>
  <link href="//creativecommons.org/licenses/by-sa/4.0/" rel="license"/>
  <style>
   .sprite{background-image:linear-gradient(transparent,transparent),url(portal/wikipedia.org/assets/img/sprite-de847d1a.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle}.svg-Commons-logo_sister{background-position:0 0;width:47px;height:47px}.svg

In [13]:
# function to extract data:

def extract_data(Soup):
    """Map every ``<h2>`` heading to the text of the elements that follow it.

    For each ``h2`` found in the document, the stripped text of its following
    sibling elements is collected until the next sibling ``h2`` is reached;
    the pieces are joined with newlines.

    Parameters
    ----------
    Soup : BeautifulSoup
        Parsed document to search.

    Returns
    -------
    dict
        ``{heading text: section text}``. If two headings have the same
        stripped text, the later one replaces the earlier entry.
    """

    def _section_body(h2_tag):
        # Gather sibling text up to (not including) the next h2 sibling.
        pieces = []
        for sibling in h2_tag.find_next_siblings():
            if sibling.name == 'h2':
                break
            pieces.append(sibling.get_text().strip())
        return '\n'.join(pieces)

    sections = {}
    for h2_tag in Soup.find_all('h2'):
        title = h2_tag.get_text().strip()
        sections[title] = _section_body(h2_tag)
    return sections


In [14]:
# Example: map each <h2> heading of the parsed page to the text that follows it.
extracted_data = extract_data(parsed_html)
print(extracted_data)

{'1,000,000+\n\n\narticles': 'Polski\nالعربية\nDeutsch\nEnglish\nEspañol\nFrançais\nItaliano\nمصرى\nNederlands\n日本語\nPortuguês\nРусский\nSinugboanong Binisaya\nSvenska\nУкраїнська\nTiếng Việt\nWinaray\n中文', '100,000+\n\n\narticles': 'Afrikaans\nAsturianu\nAzərbaycanca\nБългарски\n閩南語 / Bân-lâm-gú\nবাংলা\nБеларуская\nCatalà\nČeština\nCymraeg\nDansk\nEesti\nΕλληνικά\nEsperanto\nEuskara\nفارسی\nGalego\n한국어\nՀայերեն\nहिन्दी\nHrvatski\nBahasa Indonesia\nעברית\nქართული\nLadin\nLatina\nLatviešu\nLietuvių\nMagyar\nМакедонски\nBahasa Melayu\nBahaso Minangkabau\nမြန်မာဘာသာ\nNorskbokmålnynorsk\nНохчийн\nOʻzbekcha / Ўзбекча\nҚазақша / Qazaqşa / قازاقشا\nRomână\nSimple English\nSlovenčina\nSlovenščina\nСрпски / Srpski\nSrpskohrvatski / Српскохрватски\nSuomi\nதமிழ்\nТатарча / Tatarça\nภาษาไทย\nТоҷикӣ\nتۆرکجه\nTürkçe\nاردو\n粵語', '10,000+\n\n\narticles': 'Bahsa Acèh\nAlemannisch\nአማርኛ\nAragonés\nԱրեւմտահայերէն\nBahasa Hulontalo\nBasa Bali\nBahasa Banjar\nBasa Banyumasan\nБашҡортса\nБеларуская (тарашке

In [15]:
# function to write data into csv_file:

def write_to_csv(csv_filename , url , data_dict):
    """Write the extracted sections to a CSV file, one row per section.

    The file is created (or overwritten) as UTF-8 with the header
    ``ID, URL, LABEL, TEXT``; rows are numbered from 1 in the dictionary's
    iteration order.

    Parameters
    ----------
    csv_filename : str
        Path of the CSV file to write.
    url : str
        Value stored in the URL column of every row.
    data_dict : dict
        Mapping of label -> text; each item becomes one row.

    Returns
    -------
    None
        A confirmation is printed on success. File-system and CSV errors are
        reported with a printed message instead of being raised.
    """
    try:
        # newline='' lets the csv module control line endings itself.
        with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['ID', 'URL', 'LABEL', 'TEXT'])
            writer.writerows(
                [i, url, label, text]
                for i, (label, text) in enumerate(data_dict.items(), start=1)
            )
    except (OSError, csv.Error) as e:
        # The original handler named a non-existent `Exceptions` class, so any
        # failure surfaced as a NameError instead of this message.
        print("ERROR WRITING THE DATA", e)
    else:
        # Report success only after the file has been flushed and closed.
        print("DATA SAVED SUCCESSFULLY " , csv_filename)
        

In [16]:
# Example: save the extracted sections, recording the page they came from.
# Pass the `url` variable rather than the literal string "url", so the URL
# column holds the real address.
write_to_csv("data.csv" , url , extracted_data)

DATA SAVED SUCCESSFULLY  data.csv


In [17]:
# Main function that chains the steps above to scrape a Wikipedia page:

def scrape_wiki_page(url , csv_filename):
    """Run the whole pipeline for one page: download, parse, extract, save.

    Each stage feeds the next; if any stage yields a falsy result (no
    content, no parsed tree, or no extracted sections) the function returns
    without writing anything.

    Parameters
    ----------
    url : str
        Page to scrape; also stored in the URL column of the CSV.
    csv_filename : str
        Path of the CSV file to write.
    """
    page_bytes = fetch_html_content(url)
    if not page_bytes:
        return

    tree = parse_html(page_bytes)
    if not tree:
        return

    sections = extract_data(tree)
    if not sections:
        return

    write_to_csv(csv_filename , url , sections)

In [18]:
# Hindi Wikipedia article to scrape; the path is the percent-encoded UTF-8
# form of the title "पर्यावरण".
url = 'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%B0%E0%A5%8D%E0%A4%AF%E0%A4%BE%E0%A4%B5%E0%A4%B0%E0%A4%A3'

In [19]:
# Output file for the scraped sections.
csv_filename = "Task.csv"

In [20]:
# Run the full pipeline for the page above and write the result to csv_filename.
scrape_wiki_page(url , csv_filename)

DATA SAVED SUCCESSFULLY  Task.csv
