In [None]:
### 1. **Basic HTTP GET Request**

import requests

# Fetch the page. Without an explicit timeout, requests waits indefinitely
# on an unresponsive server, which would stall a "Run All".
response = requests.get('https://example.com', timeout=10)
# Turn 4xx/5xx answers into an exception instead of printing an error page.
response.raise_for_status()
print(response.text)

<Response [200]>
<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>


In [None]:
### 2. **Handling HTTP Response Status Codes**

import requests

# Request the page and report whether the server answered with HTTP 200.
response = requests.get('https://example.com')
message = 'Success!' if response.status_code == 200 else 'Failed to retrieve the page'
print(message)



Success!


In [None]:
### 3. **Extracting Page Title with BeautifulSoup**

from bs4 import BeautifulSoup
import requests

# Download the page (bounded by a timeout) and parse it.
response = requests.get('https://example.com', timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
# soup.title is None when the document has no <title>; guard the attribute
# access so such pages print None instead of raising AttributeError.
title = soup.title.string if soup.title else None
print(title)


Example Domain


In [None]:
### 4. **Finding All Links on a Page**

from bs4 import BeautifulSoup
import requests

# Download the page and parse it into a navigable tree.
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every anchor tag, then print each href (None when the attribute is absent).
links = soup.find_all('a')
for href in (anchor.get('href') for anchor in links):
    print(href)


https://www.iana.org/domains/example


In [None]:
### 5. **Extracting Text from Paragraphs**

from bs4 import BeautifulSoup
import requests

# Fetch and parse the page.
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'html.parser')

# Print the text content of each <p> element, one per line.
paragraphs = soup.find_all('p')
for paragraph_text in (tag.text for tag in paragraphs):
    print(paragraph_text)


This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.
More information...


In [None]:
### 6. **Handling Relative URLs**

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

base_url = 'https://example.com'
response = requests.get(base_url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
# href=True skips anchors that have no href attribute. urljoin() returns the
# base unchanged for a missing URL, so those anchors would otherwise be
# printed as if they linked to the page itself.
links = soup.find_all('a', href=True)
for link in links:
    # Resolve against the URL that was actually served (after any redirects),
    # which is what relative links on the page are relative to.
    absolute_url = urljoin(response.url, link['href'])
    print(absolute_url)



https://plus.google.com/104502282508811467249
Javascript:void(0)
Javascript:void(0)
https://cricbuzz.com/
https://cricbuzz.com/cricket-match/live-scores
https://cricbuzz.com/cricket-schedule/upcoming-series/international
https://cricbuzz.com/cricket-scorecard-archives
https://cricbuzz.com/cricket-news
https://cricbuzz.com/cricket-news
https://cricbuzz.com/cricket-news/editorial/cb-plus
https://cricbuzz.com/cricket-news/latest-news
https://cricbuzz.com/cricket-news/info/
https://cricbuzz.com/cricket-news/editorial/spotlight
https://cricbuzz.com/cricket-news/editorial/editorial-list
https://cricbuzz.com/cricket-news/editorial/specials
https://cricbuzz.com/cricket-news/editorial/stats-analysis
https://cricbuzz.com/cricket-news/editorial/interviews
https://cricbuzz.com/cricket-news/editorial/live-blogs
https://cricbuzz.com/cricket-news/experts/harsha-bhogle/170
https://cricbuzz.com/cricket-schedule/series/all
https://cricbuzz.com/cricket-series/8395/new-zealand-tour-of-india-2024
https://c

In [None]:
### 7. **Extracting Image URLs**

from bs4 import BeautifulSoup
import requests

response = requests.get('https://example.com', timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
# src=True keeps only <img> tags that carry a src attribute, so the listing
# contains URLs only and no stray None entries.
images = soup.find_all('img', src=True)
for image in images:
    print(image['src'])

https://static.cricbuzz.com/images/cb_logo.svg
None
https://static.cricbuzz.com/a/img/v1/i1/c568995/match-image.jpg?d=high&p=det
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172406/perth-scorchers-women.jpg
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172408/sydney-thunder-women.jpg
https://static.cricbuzz.com/a/img/v1/i1/c572589/match-image.jpg?d=high&p=det
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172120/bangladesh.jpg
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172188/afghanistan.jpg
https://static.cricbuzz.com/a/img/v1/i1/c569017/match-image.jpg?d=high&p=det
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172196/new-south-wales.jpg
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172225/south-australia.jpg
https://static.cricbuzz.com/a/img/v1/i1/c379130/match-image.jpg?d=high&p=det
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172150/sri-lanka-a.jpg
https://static.cricbuzz.com/a/img/v1/25x18/i1/c172375/pakistan-a.jpg
https://static.cricbuzz.com/a/img/v1/i1/c570973/match-i

In [None]:
### 8. **Using Session Objects for Persistent Cookies**

import requests

# The context manager closes the session (and its pooled connections) when
# the block exits, even if the request raises.
with requests.Session() as session:
    response = session.get('https://example.com', timeout=10)
    # Cookies set by this particular response.
    print(response.cookies)


<RequestsCookieJar[]>


In [None]:
### 9. **Handling Redirects**

import requests

# Follow redirects and show the URL of the response that was finally received.
request_options = {'allow_redirects': True}
response = requests.get('https://example.com', **request_options)
final_url = response.url
print(final_url)

https://example.com/


In [None]:
### 10. **Handling Request Headers**

import requests

# Send the request with an explicit User-Agent header and print the body.
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(
    'https://example.com',
    headers=headers,
)
body = response.text
print(body)


<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domai

In [None]:
### 11. **Post Request with Form Data**

# import requests

# payload = {'username': 'user', 'password': 'pass'}
# response = requests.post('https://example.com/login', data=payload)
# print(response.text)





In [None]:
### 12. **Using BeautifulSoup to Parse HTML**

from bs4 import BeautifulSoup

# Parse an in-memory HTML document and print the text of its first <p>.
html = '<html><head><title>Test</title></head><body><p>Hello World!</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')
first_paragraph = soup.p
print(first_paragraph.text)

Hello World!


In [None]:
### 13. **Scraping Tables from a Web Page**

from bs4 import BeautifulSoup
import requests

response = requests.get('https://www.w3schools.com/html/html_tables.asp', timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
tables = soup.find_all('table')
for table in tables:
    rows = table.find_all('tr')
    for row in rows:
        # Header rows are built from <th> cells; matching only <td> made them
        # print as empty lists, so collect both kinds of cell.
        cols = row.find_all(['th', 'td'])
        print([col.text for col in cols])


[]
['Alfreds Futterkiste', 'Maria Anders', 'Germany']
['Centro comercial Moctezuma', 'Francisco Chang', 'Mexico']
['Ernst Handel', 'Roland Mendel', 'Austria']
['Island Trading', 'Helen Bennett', 'UK']
['Laughing Bacchus Winecellars', 'Yoshi Tannamuri', 'Canada']
['Magazzini Alimentari Riuniti', 'Giovanni Rovelli', 'Italy']
[]
['<table>', 'Defines a table']
['<th>', 'Defines a header cell in a table']
['<tr>', 'Defines a row in a table']
['<td>', 'Defines a cell in a table']
['<caption>', 'Defines a table caption']
['<colgroup>', 'Specifies a group of one or more columns in a table for formatting']
['<col>', 'Specifies column properties for each column within a <colgroup> element']
['<thead>', 'Groups the header content in a table']
['<tbody>', 'Groups the body content in a table']
['<tfoot>', 'Groups the footer content in a table']


In [None]:
### 14. **Using Scrapy for Basic Web Crawling**

# Install Scrapy with: pip install scrapy
# Create a Scrapy project and a spider

!pip install scrapy

import scrapy

class ExampleSpider(scrapy.Spider):
    """Spider that emits the href of every anchor found on the start page."""

    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        """Yield one ``{'URL': href}`` item per ``a`` href in the response."""
        hrefs = response.css('a::attr(href)').getall()
        yield from ({'URL': href} for href in hrefs)



In [None]:
### 15. **Handling Pagination with Scrapy**

import scrapy

class PaginationSpider(scrapy.Spider):
    """Spider that scrapes articles page by page, following the ``a.next`` link."""

    name = 'pagination'
    start_urls = ['https://example.com/page/1']

    def parse(self, response):
        """Yield a title/link item per ``article``, then a request for the next page."""
        for entry in response.css('article'):
            heading = entry.css('h2::text').get()
            href = entry.css('a::attr(href)').get()
            yield {'title': heading, 'link': href}

        next_page = response.css('a.next::attr(href)').get()
        if next_page is None:
            # No "next" link on this page: nothing more to request.
            return
        yield response.follow(next_page, callback=self.parse)


In [None]:
### 16. **Handling JavaScript-Rendered Content with Selenium**

from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')
    # page_source is the DOM as the browser currently holds it.
    print(driver.page_source)
finally:
    # Always shut the browser down, even if loading the page raises;
    # otherwise the Chrome process is left running.
    driver.quit()

ModuleNotFoundError: No module named 'selenium'

In [None]:

### 17. **Extracting Data with XPath Using Selenium**

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')
    # The find_element_by_* helpers were removed in Selenium 4; locators are
    # now passed to find_element() as a (By.<strategy>, value) pair.
    element = driver.find_element(By.XPATH, '//h1')
    print(element.text)
finally:
    # Close the browser even when the lookup above fails.
    driver.quit()

In [None]:
### 18. **Rate Limiting with Time Delays**

import requests
import time

# Pause between consecutive requests, in seconds.
REQUEST_DELAY_SECONDS = 2

urls = ['https://example.com/page1', 'https://example.com/page2']
for index, url in enumerate(urls):
    if index:
        # Delay to avoid hitting the server too quickly. The wait sits before
        # every request except the first, so no time is spent sleeping after
        # the final one.
        time.sleep(REQUEST_DELAY_SECONDS)
    response = requests.get(url, timeout=10)
    print(response.status_code)


In [None]:
### 19. **Handling Dynamic Content with Selenium**

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')
    # Dynamic elements may not exist yet when get() returns. Poll for up to
    # 10 seconds until the button is clickable instead of failing at once.
    button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'load-more'))
    )
    button.click()
    # NOTE(review): content loaded by the click may still be arriving here;
    # wait for a page-specific element before reading page_source if needed.
    print(driver.page_source)
finally:
    # Always shut the browser down, even if a step above raises.
    driver.quit()

In [None]:
### 20. **Extracting Data from JSON Responses**

import requests

response = requests.get('https://api.example.com/data', timeout=10)
# Fail on 4xx/5xx here: an error page is usually not JSON, and decoding it
# would raise a less helpful JSON error instead of the real HTTP failure.
response.raise_for_status()
data = response.json()
print(data)


In [None]:
### 21. **Handling Authentication with Requests**

import os

import requests
from requests.auth import HTTPBasicAuth

# Read credentials from the environment so real secrets never live in the
# notebook; the placeholder defaults keep the demo runnable as written.
username = os.environ.get('EXAMPLE_USER', 'user')
password = os.environ.get('EXAMPLE_PASSWORD', 'pass')

response = requests.get(
    'https://example.com/secure',
    auth=HTTPBasicAuth(username, password),
    timeout=10,
)
print(response.text)

In [None]:
### 22. **Extracting Metadata from a Web Page**

from bs4 import BeautifulSoup
import requests

# Fetch and parse the page.
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'html.parser')

# Print the name/content attribute pair of every <meta> tag
# (either value is None when the tag lacks that attribute).
metadata = soup.find_all('meta')
for meta_name, meta_content in ((tag.get('name'), tag.get('content')) for tag in metadata):
    print(meta_name, meta_content)

In [None]:
### 23. **Scraping Data with Multiple Requests**

import requests
from bs4 import BeautifulSoup

urls = ['https://example.com/page1', 'https://example.com/page2']
for url in urls:
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    # A page without a <title> makes soup.title None; guard the access so one
    # such page prints None instead of aborting the loop with AttributeError.
    title = soup.title.string if soup.title else None
    print(title)



In [None]:
### 24. **Extracting Data with Regex**

import requests
import re

response = requests.get('https://example.com', timeout=10)
# [^>]* tolerates attributes on the opening tag, IGNORECASE accepts <TITLE>,
# and DOTALL lets the lazy group match a title that spans several lines.
matches = re.findall(
    r'<title[^>]*>(.*?)</title>',
    response.text,
    flags=re.IGNORECASE | re.DOTALL,
)
print(matches)

In [None]:
### 25. **Handling Cookies in Requests**

import requests

# Attach a cookie to the outgoing request and print the response body.
cookies = dict(session_id='123456')
response = requests.get('https://example.com', cookies=cookies)
page_body = response.text
print(page_body)


In [None]:
### 26. **Storing Scraped Data in CSV**

import requests
import csv
from bs4 import BeautifulSoup

response = requests.get('https://example.com', timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
# One record per <p> element; the paragraph text goes in the 'title' column.
data = [{'title': p.text} for p in soup.find_all('p')]

# The encoding is explicit because the platform default may be unable to
# encode scraped text. newline='' lets the csv module write the line endings.
with open('data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['title'])
    writer.writeheader()
    writer.writerows(data)

In [None]:
### 27. **Scraping Multiple Pages with Requests**

import requests
from bs4 import BeautifulSoup

# Visit pages 1 through 3 and print each page's title.
for page in range(1, 4):
    response = requests.get(f'https://example.com/page/{page}', timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    # soup.title is None for a page without a <title>; print None for it
    # rather than stopping the whole loop with AttributeError.
    title = soup.title.string if soup.title else None
    print(title)

In [None]:
### 28. **Using Proxies with Requests**

import requests

# Map each URL scheme to the proxy that should carry it, then send the request.
proxies = dict(
    http='http://10.10.1.10:3128',
    https='http://10.10.1.10:1080',
)
response = requests.get('https://example.com', proxies=proxies)
print(response.text)


In [None]:
### 29. **Handling Large Data with Streaming**

import requests

# With stream=True the connection stays open until the body is consumed or
# the response is closed; the context manager guarantees it is released.
with requests.get('https://example.com/largefile', stream=True, timeout=10) as response:
    # Check the status before creating the file so an error page is never
    # saved to disk as if it were the download.
    response.raise_for_status()
    with open('largefile', 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # skip empty chunks
                file.write(chunk)

In [None]:
### 30. **Custom User-Agent Header**

import requests

# Identify the client with a custom User-Agent and print the page body.
headers = {}
headers['User-Agent'] = 'MyCrawler/1.0'
response = requests.get(url='https://example.com', headers=headers)
print(response.text)