In [3]:
pwd

'C:\\Users\\Akash'

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

def scrape_website(url, selector):
    """
    Basic web scraper function
    
    Parameters:
    url (str): The URL to scrape
    selector (str): CSS selector to target specific elements
    
    Returns:
    list: Extracted data
    """
    # Add headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    # Make the request
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return []
    
    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract data using the provided selector
    elements = soup.select(selector)
    
    # Extract text from elements
    results = [element.text.strip() for element in elements]
    
    return results

# Example usage
if __name__ == "__main__":
    # Example: Scraping book titles from books.toscrape.com
    url = "http://books.toscrape.com/"
    selector = "article.product_pod h3 a"
    
    # Scrape the data
    book_titles = scrape_website(url, selector)
    
    # Display results
    if book_titles:
        print(f"Found {len(book_titles)} books:")
        for i, title in enumerate(book_titles, 1):
            print(f"{i}. {title}")
        
        # Save to CSV
        df = pd.DataFrame(book_titles, columns=["Book Title"])
        df.to_csv("book_titles.csv", index=False)
        print("\nData saved to book_titles.csv")
    else:
        print("No data found.")

Found 20 books:
1. A Light in the ...
2. Tipping the Velvet
3. Soumission
4. Sharp Objects
5. Sapiens: A Brief History ...
6. The Requiem Red
7. The Dirty Little Secrets ...
8. The Coming Woman: A ...
9. The Boys in the ...
10. The Black Maria
11. Starving Hearts (Triangular Trade ...
12. Shakespeare's Sonnets
13. Set Me Free
14. Scott Pilgrim's Precious Little ...
15. Rip it Up and ...
16. Our Band Could Be ...
17. Olio
18. Mesaerion: The Best Science ...
19. Libertarianism for Beginners
20. It's Only the Himalayas

Data saved to book_titles.csv


In [10]:
import os

# Print the current working directory
print(f"Current working directory: {os.getcwd()}")

# List files in the current directory
print("\nFiles in current directory:")
for file in os.listdir():
    # Only show data files with common extensions
    if file.endswith(('.csv', '.xlsx', '.json', '.pkl', '.h5', '.parquet', '.db', '.sqlite')):
        print(f" - {file}")

# If you're using pandas, you might want to check its version instead
import pandas as pd
print(f"\nPandas version: {pd.__version__}")

# If you need to know where pandas might look for data, you can print the current directory
print(f"Data files would typically be read from: {os.getcwd()}")

Current working directory: C:\Users\Akash

Files in current directory:
 - book_titles.csv

Pandas version: 2.2.3
Data files would typically be read from: C:\Users\Akash


In [9]:
import os

# Print the current working directory
print(f"Current working directory: {os.getcwd()}")

# List files in the current directory
print("\nFiles in current directory:")
for file in os.listdir():
    # Only show data files with common extensions
    if file.endswith(('.csv', '.xlsx', '.json', '.pkl', '.h5', '.parquet', '.db', '.sqlite')):
        print(f" - {file}")

# If you're using pandas, you might want to check its version instead
import pandas as pd
print(f"\nPandas version: {pd.__version__}")

# If you need to know where pandas might look for data, you can print the working directory
print(f"Data files would typically be read from: {os.getcwd()}")

Current working directory: C:\Users\Akash

Files in current directory:
 - book_titles.csv

Pandas version: 2.2.3
Data files would typically be read from: C:\Users\Akash


In [None]:
# Install Beautiful Soup using pip
!pip install beautifulsoup4

# To verify the installation and show how to import it
import bs4
from bs4 import BeautifulSoup
print(f"Beautiful Soup version: {bs4.__version__}")
print("Beautiful Soup installed successfully!")

In [None]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Install required libraries if not already installed
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    !pip install requests beautifulsoup4
    import requests
    from bs4 import BeautifulSoup

# Function to scrape a website
def scrape_website(url):
    # Send a request to the website
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for 4XX/5XX responses
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Example: Extract all headlines (h1, h2, h3 tags)
        headlines = []
        for i in range(1, 4):
            for heading in soup.find_all(f'h{i}'):
                headlines.append({
                    'level': i,
                    'text': heading.text.strip()
                })
        
        # Example: Extract all links
        links = []
        for link in soup.find_all('a', href=True):
            links.append({
                'text': link.text.strip(),
                'url': link['href']
            })
        
        return {
            'title': soup.title.text if soup.title else 'No title found',
            'headlines': headlines,
            'links': links[:10]  # Limiting to first 10 links for brevity
        }
    
    except requests.exceptions.RequestException as e:
        return {'error': f"Request error: {e}"}
    except Exception as e:
        return {'error': f"Error: {e}"}

# Example usage
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"  # Example URL
print("Scraping website...")
result = scrape_website(url)

# Display results
print(f"\nTitle: {result.get('title')}")

print("\nSample Headlines:")
for headline in result.get('headlines', [])[:5]:  # Show first 5 headlines
    print(f"H{headline['level']}: {headline['text']}")

print("\nSample Links:")
for link in result.get('links', [])[:5]:  # Show first 5 links
    print(f"- {link['text']}: {link['url']}")

# Convert to DataFrame for easier analysis
headlines_df = pd.DataFrame(result.get('headlines', []))
links_df = pd.DataFrame(result.get('links', []))

print("\nHeadlines DataFrame Preview:")
if not headlines_df.empty:
    print(headlines_df.head())

print("\nLinks DataFrame Preview:")
if not links_df.empty:
    print(links_df.head())

In [7]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup

# Install required libraries if not already installed
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    !pip install requests beautifulsoup4
    import requests
    from bs4 import BeautifulSoup

# Function to scrape a website
def scrape_website(url):
    # Send a request to the website
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        # Get the webpage content
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check if the request was successful
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Get the page title
        title = soup.title.text if soup.title else "No title found"
        
        # Get all paragraph text
        paragraphs = [p.text.strip() for p in soup.find_all('p')]
        
        # Get all links
        links = [{'text': a.text.strip(), 'url': a['href']} 
                for a in soup.find_all('a', href=True) if a.text.strip()]
        
        return {
            'title': title,
            'paragraphs': paragraphs[:5],  # First 5 paragraphs
            'links': links[:10]  # First 10 links
        }
        
    except Exception as e:
        return {'error': str(e)}

# Example usage
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
print(f"Scraping {url}...")

# Get the data
data = scrape_website(url)

# Print the results
if 'error' in data:
    print(f"Error: {data['error']}")
else:
    print(f"\nTitle: {data['title']}")
    
    print("\nFirst few paragraphs:")
    for i, p in enumerate(data['paragraphs'], 1):
        if p:  # Only print non-empty paragraphs
            print(f"{i}. {p[:100]}...")  # Print first 100 chars of each paragraph
    
    print("\nSome links found:")
    for i, link in enumerate(data['links'], 1):
        print(f"{i}. {link['text']} -> {link['url']}")

Scraping https://en.wikipedia.org/wiki/Python_(programming_language)...

Title: Python (programming language) - Wikipedia

First few paragraphs:
2. Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code ...
3. Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming lan...
4. Python has gained widespread use in the machine learning community.[36][37][38][39] It is widely tau...
5. Python was conceived in the late 1980s[42] by Guido van Rossum at Centrum Wiskunde & Informatica (CW...

Some links found:
1. Jump to content -> #bodyContent
2. Main page -> /wiki/Main_Page
3. Contents -> /wiki/Wikipedia:Contents
4. Current events -> /wiki/Portal:Current_events
5. Random article -> /wiki/Special:Random
6. About Wikipedia -> /wiki/Wikipedia:About
7. Contact us -> //en.wikipedia.org/wiki/Wikipedia:Contact_us
8. Help -> /wiki/Help:Contents
9. Learn to edit -> /wiki/Help:Introduction
10. Community portal -> /wiki

In [6]:
# Simple web scraper using Beautiful Soup
import requests
from bs4 import BeautifulSoup

# Install required packages if not already installed
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    !pip install requests beautifulsoup4

# URL to scrape
url = "https://www.example.com"

# Send HTTP request to the URL
print(f"Fetching content from {url}...")
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract and print the title
    title = soup.title.string
    print(f"\nPage Title: {title}")
    
    # Extract and print all headings
    print("\nHeadings:")
    for heading in soup.find_all(['h1', 'h2', 'h3']):
        print(f"- {heading.text.strip()}")
    
    # Extract and print all paragraphs
    print("\nParagraphs:")
    for paragraph in soup.find_all('p'):
        print(f"- {paragraph.text.strip()[:100]}...")  # Print first 100 chars
    
    # Extract and print all links
    print("\nLinks:")
    for link in soup.find_all('a', href=True):
        print(f"- {link.text.strip() or '[No text]'}: {link['href']}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

print("\nWeb scraping completed!")

Fetching content from https://www.example.com...

Page Title: Example Domain

Headings:
- Example Domain

Paragraphs:
- This domain is for use in documentation examples without needing permission. Avoid use in operations...
- Learn more...

Links:
- Learn more: https://iana.org/domains/example

Web scraping completed!
