In [3]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [6]:
# Function to scrape IMDb's Top 100 Indian movies
def scrape_imdb_top_100_indian_movies(url):
    # Send request to the URL
    response = requests.get(url)

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract movie details
    movies_data = []
    for movie in soup.select('.lister-item-content'):
        # Check if rating is available
        rating_tag = movie.select_one('.ratings-imdb-rating strong')
        rating = float(rating_tag.get_text(strip=True)) if rating_tag else None
        
        # Check if year is available
        year_tag = movie.select_one('.lister-item-year')
        year_text = year_tag.get_text(strip=True).strip('()') if year_tag else None
        year = int(''.join(filter(str.isdigit, year_text))) if year_text else None
        
        name = movie.select_one('.lister-item-header a').get_text(strip=True)
        
        movies_data.append({'Name': name, 'Rating': rating, 'Year': year})

    # Create DataFrame
    movies_df = pd.DataFrame(movies_data)

    return movies_df

# Test the function
imdb_url = "https://www.imdb.com/list/ls056092300/"
movies_df = scrape_imdb_top_100_indian_movies(imdb_url)
print("IMDb's Top Rated 100 Indian Movies:")
print(movies_df)


IMDb's Top Rated 100 Indian Movies:
                                 Name Rating  Year
0                     Ship of Theseus   None  2012
1                              Iruvar   None  1997
2                     Kaagaz Ke Phool   None  1959
3   Lagaan: Once Upon a Time in India   None  2001
4                     Pather Panchali   None  1955
..                                ...    ...   ...
95                        Apur Sansar   None  1959
96                        Kanchivaram   None  2008
97                    Monsoon Wedding   None  2001
98                              Black   None  2005
99                            Deewaar   None  1975

[100 rows x 3 columns]


In [7]:
# Function to scrape product name, price, and discounts from Peachmode
def scrape_peachmode_products(url):
    # Send request to the URL
    response = requests.get(url)

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract product details
    products_data = []
    for product in soup.select('.product-grid-item'):
        name = product.select_one('.name').get_text(strip=True)
        price = product.select_one('.price').get_text(strip=True)
        discount = product.select_one('.discount').get_text(strip=True)
        products_data.append({'Product Name': name, 'Price': price, 'Discount': discount})

    # Create DataFrame
    products_df = pd.DataFrame(products_data)

    return products_df

# Test the function
peachmode_url = "https://peachmode.com/search?q=bags"
products_df = scrape_peachmode_products(peachmode_url)
print("Scraped Product Details from Peachmode:")
print(products_df)


Scraped Product Details from Peachmode:
Empty DataFrame
Columns: []
Index: []


In [8]:
# Function to scrape Top 10 ODI teams in men’s cricket along with matches, points, and rating
def scrape_odi_teams(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    teams_data = []
    for team in soup.select('.table-body'):
        name = team.select_one('.table-body__cell.rankings-table__team').get_text(strip=True)
        matches = team.select_one('.table-body__cell.u-center-text').get_text(strip=True)
        points = team.select('td')[3].get_text(strip=True)
        rating = team.select_one('.table-body__cell.u-text-right.rating').get_text(strip=True)
        teams_data.append({'Team': name, 'Matches': matches, 'Points': points, 'Rating': rating})
        if len(teams_data) >= 10:
            break

    teams_df = pd.DataFrame(teams_data)
    return teams_df

# Function to scrape Top 10 ODI Batsmen along with team and rating
def scrape_odi_batsmen(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    batsmen_data = []
    for player in soup.select('.table-body'):
        name = player.select_one('.table-body__cell.rankings-table__name a').get_text(strip=True)
        team = player.select_one('.table-body__cell.rankings-table__team').get_text(strip=True)
        rating = player.select_one('.table-body__cell.u-text-right.rating').get_text(strip=True)
        batsmen_data.append({'Batsman': name, 'Team': team, 'Rating': rating})
        if len(batsmen_data) >= 10:
            break

    batsmen_df = pd.DataFrame(batsmen_data)
    return batsmen_df

# Function to scrape Top 10 ODI bowlers along with team and rating
def scrape_odi_bowlers(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    bowlers_data = []
    for player in soup.select('.table-body'):
        name = player.select_one('.table-body__cell.rankings-table__name a').get_text(strip=True)
        team = player.select_one('.table-body__cell.rankings-table__team').get_text(strip=True)
        rating = player.select_one('.table-body__cell.u-text-right.rating').get_text(strip=True)
        bowlers_data.append({'Bowler': name, 'Team': team, 'Rating': rating})
        if len(bowlers_data) >= 10:
            break

    bowlers_df = pd.DataFrame(bowlers_data)
    return bowlers_df

# Test the functions
teams_url = "https://www.icc-cricket.com/rankings/mens/team-rankings/odi"
batsmen_url = "https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting"
bowlers_url = "https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling"

print("Top 10 ODI Teams:")
print(scrape_odi_teams(teams_url))
print("\nTop 10 ODI Batsmen:")
print(scrape_odi_batsmen(batsmen_url))
print("\nTop 10 ODI Bowlers:")
print(scrape_odi_bowlers(bowlers_url))


Top 10 ODI Teams:
Empty DataFrame
Columns: []
Index: []

Top 10 ODI Batsmen:
Empty DataFrame
Columns: []
Index: []

Top 10 ODI Bowlers:
Empty DataFrame
Columns: []
Index: []


In [9]:
def scrape_patreon_posts(url):
    # Send request to the URL
    response = requests.get(url)

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract post details
    posts_data = []
    for post in soup.select('.postItem'):
        # Extract heading
        heading = post.select_one('.postItem__title').get_text(strip=True)

        # Extract date
        date = post.select_one('.postItem__date').get_text(strip=True)

        # Extract content
        content = post.select_one('.postItem__content').get_text(strip=True)

        # Extract YouTube video link
        youtube_link = post.select_one('.postItem__media__content a[href*="youtube.com"]')
        youtube_video_likes = None
        if youtube_link:
            youtube_url = youtube_link['href']
            youtube_response = requests.get(youtube_url)
            youtube_soup = BeautifulSoup(youtube_response.text, 'html.parser')
            youtube_video_likes_element = youtube_soup.select_one('.like-button-renderer-like-button-unclicked span')
            if youtube_video_likes_element:
                youtube_video_likes = youtube_video_likes_element.text.strip()

        # Append data to list
        posts_data.append({'Heading': heading, 'Date': date, 'Content': content, 'YouTube Video Likes': youtube_video_likes})

    # Create DataFrame
    posts_df = pd.DataFrame(posts_data)

    return posts_df

# Test the function
patreon_url = "https://www.patreon.com/coreyms"
posts_df = scrape_patreon_posts(patreon_url)
print(posts_df)


Empty DataFrame
Columns: []
Index: []


In [10]:
def scrape_house_details(locality):
    # Base URL for searching house listings
    base_url = f"https://www.nobroker.in/property/sale/bangalore/{locality}"

    # Send request to the URL
    response = requests.get(base_url)

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract house details
    houses_data = []
    for house in soup.select('.card'):
        title = house.select_one('.card-title').get_text(strip=True)
        location = house.select_one('.card-title ~ .card-subtitle').get_text(strip=True)
        area = house.select_one('.card-text-area').get_text(strip=True)
        emi = house.select_one('.card-text-emidiv').get_text(strip=True)
        price = house.select_one('.card-price').get_text(strip=True)
        houses_data.append({'Title': title, 'Location': location, 'Area': area, 'EMI': emi, 'Price': price})

    # Create DataFrame
    houses_df = pd.DataFrame(houses_data)

    return houses_df

# Specify localities
localities = ['Indira-Nagar', 'Jayanagar', 'Rajaji-Nagar']

# Scrape house details for each locality
for locality in localities:
    print(f"Scraping house details for {locality}:")
    houses_df = scrape_house_details(locality)
    print(houses_df)
    print()


Scraping house details for Indira-Nagar:
Empty DataFrame
Columns: []
Index: []

Scraping house details for Jayanagar:
Empty DataFrame
Columns: []
Index: []

Scraping house details for Rajaji-Nagar:
Empty DataFrame
Columns: []
Index: []



In [11]:
def scrape_product_details(url):
    # Send request to the URL
    response = requests.get(url)

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract product details
    products_data = []
    for product in soup.select('.productCardWrapper')[:10]:  # Limit to first 10 products
        name = product.select_one('.productCardDetail').get_text(strip=True)
        price = product.select_one('.productPrice').get_text(strip=True)
        image_url = product.select_one('.productCardImg')['src']
        products_data.append({'Product Name': name, 'Price': price, 'Image URL': image_url})

    # Create DataFrame
    products_df = pd.DataFrame(products_data)

    return products_df

# URL of the page containing the list of bestsellers
url = "https://www.bewakoof.com/bestseller?sort=popular"

# Scrape product details
products_df = scrape_product_details(url)
print("Scraped Product Details:")
print(products_df)


Scraped Product Details:
Empty DataFrame
Columns: []
Index: []


In [14]:
def scrape_cnbc_world_news(url):
    # Send request to the URL
    response = requests.get(url)

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract news details
    news_data = []
    for article in soup.select('.Card-title'):
        # Extract heading
        heading = article.get_text(strip=True)

        # Extract date
        try:
            date_tag = article.find_previous(class_='Card-meta-time')
            date = date_tag.get_text(strip=True)
        except AttributeError:
            date = None

        # Extract news link
        news_link_tag = article.find_parent('a')
        news_link = news_link_tag['href'] if news_link_tag else None

        news_data.append({'Heading': heading, 'Date': date, 'News Link': news_link})

    # Create DataFrame
    news_df = pd.DataFrame(news_data)

    return news_df

# URL of the CNBC World news page
url = "https://www.cnbc.com/world/?region=world"

# Scrape news details
news_df = scrape_cnbc_world_news(url)
print("Scraped CNBC World News Details:")
print(news_df)


Scraped CNBC World News Details:
                                              Heading  Date News Link
0   Stocks making the biggest moves midday: GameSt...  None      None
1   A majority of investors believe a stock market...  None      None
2   S&P 500 rises as it tries to snap 3-day slide,...  None      None
3   Treasury yields slide as investors weigh econo...  None      None
4   JPMorgan says oil can rise to nearly $100 a ba...  None      None
5   Cathie Wood's ARK ETF is forming a bottoming p...  None      None
6   Citi says buy this chip stock ahead of its AI ...  None      None
7   Chip stocks are losing their mojo. How to trad...  None      None
8   These three stocks are on the verge of forming...  None      None
9   Russia claimed the West, Kyiv ordered the Mosc...  None      None
10  Russia says it's 'hard to believe' Islamic Sta...  None      None
11  Russia's intelligence chief claims U.S., U.K. ...  None      None
12  Putin expected to use deadly Moscow attack to ...  No

In [15]:
def scrape_downloaded_articles(url):
    # Send request to the URL
    response = requests.get(url)

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract article details
    articles_data = []
    for article in soup.select('.view-content .list-article'):
        # Extract paper title
        title = article.select_one('.list-article-title').get_text(strip=True)

        # Extract publication date
        date = article.select_one('.list-article-date').get_text(strip=True)

        # Extract author
        author = article.select_one('.list-article-author').get_text(strip=True)

        articles_data.append({'Paper Title': title, 'Date': date, 'Author': author})

    # Create DataFrame
    articles_df = pd.DataFrame(articles_data)

    return articles_df

# URL of the most downloaded articles page
url = "https://www.keaipublishing.com/en/journals/artificial-intelligence-in-agriculture/most-downloadedarticles/"

# Scrape article details
articles_df = scrape_downloaded_articles(url)
print("Scraped Most Downloaded Articles Details:")
print(articles_df)


Scraped Most Downloaded Articles Details:
Empty DataFrame
Columns: []
Index: []
