In [1]:
import os

# Switch the kernel's working directory to the project folder.
# NOTE(review): this is an absolute, machine-specific path; the cell fails on
# any machine where the directory does not exist.
workdir = '/Users/payalteyung/Desktop/DA'
os.chdir(workdir)

# Echo the directory now in effect so the switch can be confirmed visually.
print(os.getcwd())

/Users/payalteyung/Desktop/DA


In [2]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import logging

In [3]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
def scrape_setopati_images(base_url, output_folder, timeout=10, delay=1):
    """Download the first image of every ``div.items`` block on a page.

    The page at ``base_url`` is fetched and parsed; for each
    ``<div class="items">`` element the text of its
    ``<span class="main-title">`` is used as the title and the first ``<img>``
    with a ``src`` attribute is downloaded into ``output_folder``.

    Args:
        base_url: URL of the page to scrape; also the base for resolving
            relative image URLs.
        output_folder: Directory that receives the images (created if needed).
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after each article block.

    Returns:
        A list of ``{"title": ..., "image_url": ...}`` dicts, one per block
        that had an image ``src`` (whether or not its download succeeded).
        An empty list is returned if the page itself could not be processed.
    """
    # Conservative cap on a single file name; 255 bytes is the limit on many
    # common filesystems, and multi-byte titles reach it quickly.
    max_name_bytes = 255

    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        # Without a timeout a stalled server would block the cell forever.
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        os.makedirs(output_folder, exist_ok=True)

        articles = soup.find_all('div', class_='items')
        images = []

        for index, article in enumerate(articles):
            title_tag = article.find('span', class_='main-title')
            title = title_tag.text.strip() if title_tag else ""
            if not title:
                # Missing or whitespace-only title: fall back to a placeholder.
                title = f"Untitled_{index}"
            sanitized_title = "_".join(title.split()).replace("/", "-")

            img_tag = article.find('img')
            if img_tag and 'src' in img_tag.attrs:
                img_url = urljoin(base_url, img_tag['src'])
                images.append({"title": title, "image_url": img_url})

                # Trim the title part (on a UTF-8 character boundary) so the
                # complete file name stays within the byte cap.
                suffix = f"_{index}.jpg"
                budget = max_name_bytes - len(suffix.encode("utf-8"))
                stem = sanitized_title.encode("utf-8")[:budget].decode("utf-8", errors="ignore")
                img_path = os.path.join(output_folder, stem + suffix)

                try:
                    # The context manager releases the streamed connection
                    # even when the download fails part-way.
                    with requests.get(img_url, stream=True, headers=headers, timeout=timeout) as img_response:
                        img_response.raise_for_status()
                        with open(img_path, 'wb') as img_file:
                            for chunk in img_response.iter_content(1024):
                                img_file.write(chunk)

                    logging.info(f"Downloaded: {title}")
                except (requests.exceptions.RequestException, OSError) as e:
                    # A network or file-system failure for one image must not
                    # abort the remaining articles.
                    logging.error(f"Failed to download image for {title}: {e}")
            else:
                logging.warning(f"No image found for: {title}")

            # Pause between articles to avoid hammering the server.
            time.sleep(delay)

        return images

    except Exception as e:
        # Page-level failure (fetch, parse, folder creation): report and
        # return an empty result instead of raising.
        logging.error(f"An error occurred: {e}")
        return []

In [5]:
if __name__ == "__main__":
    # Target page and the folder (relative to the working directory) that
    # receives the downloaded images.
    base_url = "https://www.setopati.com/"
    output_folder = "setopati_images"

    images_and_titles = scrape_setopati_images(
        base_url=base_url,
        output_folder=output_folder,
    )

    # Summarise what the scraper collected, one log record per entry.
    logging.info("Scraped images and titles:")
    for item in images_and_titles:
        logging.info(f"Title: {item['title']}, Image URL: {item['image_url']}")

2024-12-30 13:28:50,132 - INFO - Downloaded: नेप्सेले गर्‍यो सात कम्पनीको मूल्य समायोजन, नयाँ मूल्य कति?
2024-12-30 13:28:51,437 - INFO - Downloaded: 'नेपालमा पनि कोही बस्छ र!'
2024-12-30 13:28:52,754 - INFO - Downloaded: चीनले ब्रह्मपुत्रमा बनाउन लागेको विशाल जलविद्युत आयोजनाले के असर गर्छ नेपाललाई?
2024-12-30 13:28:55,084 - INFO - Downloaded: छोराहरूले लत्तो छाडेको व्यवसाय सम्हालेर वर्षकै २० लाख कमाइरहेको दम्पती
2024-12-30 13:28:56,396 - INFO - Downloaded: चालक र माली काम गर्ने आले दम्पतीले हुर्काएका चार सन्तान
2024-12-30 13:28:57,721 - INFO - Downloaded: रातको चिया व्यापार
2024-12-30 13:28:59,051 - INFO - Downloaded: अब विकट छैन ‘बोबाङ गाउँ’
2024-12-30 13:29:00,352 - INFO - Downloaded: कोरियामा सिकेर बेच्न थाले मासु, लगानी पुग्यो २ करोड
2024-12-30 13:29:01,675 - INFO - Downloaded: चौथो टेस्टमा भारतमाथि अस्ट्रेलियाको जित, सिरिजमा लियो अग्रता
2024-12-30 13:29:03,012 - INFO - Downloaded: भारत र अस्ट्रेलियाबीचको टेस्ट हेर्ने दर्शकको संख्याले बनायो कीर्तिमान
2024-12-30 13:29:04,320 - INF

In [17]:
def scrape_ratopati_images(base_url, output_folder, timeout=10, delay=1):
    """Download the first image of every ``section.breaking--special`` block.

    The page at ``base_url`` is fetched and parsed; for each
    ``<section class="breaking--special">`` element the text of its
    ``<h2 class="heading text--black">`` is used as the title and the first
    ``<img>`` with a ``src`` attribute is downloaded into ``output_folder``.

    Args:
        base_url: URL of the page to scrape; also the base for resolving
            relative image URLs.
        output_folder: Directory that receives the images (created if needed).
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after each article block.

    Returns:
        A list of ``{"title": ..., "image_url": ...}`` dicts, one per block
        that had an image ``src`` (whether or not its download succeeded).
        An empty list is returned if the page itself could not be processed.
    """
    # Conservative cap on a single file name; 255 bytes is the limit on many
    # common filesystems, and multi-byte titles reach it quickly.
    max_name_bytes = 255

    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        # Without a timeout a stalled server would block the cell forever.
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        os.makedirs(output_folder, exist_ok=True)

        articles = soup.find_all('section', class_='breaking--special')
        images = []

        for index, article in enumerate(articles):
            title_tag = article.find('h2', class_='heading text--black')
            title = title_tag.text.strip() if title_tag else ""
            if not title:
                # Missing or whitespace-only title: fall back to a placeholder.
                title = f"Untitled_{index}"
            sanitized_title = "_".join(title.split()).replace("/", "-")

            img_tag = article.find('img')
            # NOTE(review): a recorded run of this notebook yielded a generic
            # ".../media/ratopati.jpg" URL here, which may be a lazy-loading
            # placeholder; confirm whether the real image URL lives in another
            # attribute (e.g. data-src) on this site.
            if img_tag and 'src' in img_tag.attrs:
                img_url = urljoin(base_url, img_tag['src'])
                images.append({"title": title, "image_url": img_url})

                # Trim the title part (on a UTF-8 character boundary) so the
                # complete file name stays within the byte cap.
                suffix = f"_{index}.jpg"
                budget = max_name_bytes - len(suffix.encode("utf-8"))
                stem = sanitized_title.encode("utf-8")[:budget].decode("utf-8", errors="ignore")
                img_path = os.path.join(output_folder, stem + suffix)

                try:
                    # The context manager releases the streamed connection
                    # even when the download fails part-way.
                    with requests.get(img_url, stream=True, headers=headers, timeout=timeout) as img_response:
                        img_response.raise_for_status()
                        with open(img_path, 'wb') as img_file:
                            for chunk in img_response.iter_content(1024):
                                img_file.write(chunk)

                    logging.info(f"Downloaded: {title}")
                except (requests.exceptions.RequestException, OSError) as e:
                    # A network or file-system failure for one image must not
                    # abort the remaining articles.
                    logging.error(f"Failed to download image for {title}: {e}")
            else:
                logging.warning(f"No image found for: {title}")

            # Pause between articles to avoid hammering the server.
            time.sleep(delay)

        return images

    except Exception as e:
        # Page-level failure (fetch, parse, folder creation): report and
        # return an empty result instead of raising.
        logging.error(f"An error occurred: {e}")
        return []

In [18]:
if __name__ == "__main__":
    # Target page and the folder (relative to the working directory) that
    # receives the downloaded images.
    base_url = "https://www.ratopati.com/"
    output_folder = "ratopati_images"

    images_and_titles = scrape_ratopati_images(
        base_url=base_url,
        output_folder=output_folder,
    )

    # Print a plain-text summary, one line per collected entry.
    print("Scraped images and titles:")
    for item in images_and_titles:
        print(f"Title: {item['title']}, Image URL: {item['image_url']}")

2024-12-30 14:29:08,170 - INFO - Downloaded: भुटानी शरणार्थी प्रकरणमा चलखेल गरेको आरोपमा केन्द्रीय कारागारका डीएसपी तानिए


Scraped images and titles:
Title: भुटानी शरणार्थी प्रकरणमा चलखेल गरेको आरोपमा केन्द्रीय कारागारका डीएसपी तानिए, Image URL: https://rpcdn.ratopati.com/media/ratopati.jpg


In [15]:
def scrape_rajdhani_images(base_url, output_folder, timeout=10, delay=1):
    """Download the first image of every ``div.elementor-post__text`` block.

    The page at ``base_url`` is fetched and parsed; for each
    ``<div class="elementor-post__text">`` element the text of its
    ``<h3 class="elementor-post__title">`` is used as the title and the first
    ``<img>`` with a ``src`` attribute is downloaded into ``output_folder``.

    Args:
        base_url: URL of the page to scrape; also the base for resolving
            relative image URLs.
        output_folder: Directory that receives the images (created if needed).
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after each article block.

    Returns:
        A list of ``{"title": ..., "image_url": ...}`` dicts, one per block
        that had an image ``src`` (whether or not its download succeeded).
        An empty list is returned if the page itself could not be processed.
    """
    # Conservative cap on a single file name; 255 bytes is the limit on many
    # common filesystems, and multi-byte titles reach it quickly.
    max_name_bytes = 255

    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        # Without a timeout a stalled server would block the cell forever.
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        os.makedirs(output_folder, exist_ok=True)

        articles = soup.find_all('div', class_='elementor-post__text')
        images = []

        for index, article in enumerate(articles):
            title_tag = article.find('h3', class_='elementor-post__title')
            title = title_tag.text.strip() if title_tag else ""
            if not title:
                # Missing or whitespace-only title: fall back to a placeholder.
                title = f"Untitled_{index}"
            sanitized_title = "_".join(title.split()).replace("/", "-")

            img_tag = article.find('img')
            if img_tag and 'src' in img_tag.attrs:
                img_url = urljoin(base_url, img_tag['src'])
                images.append({"title": title, "image_url": img_url})

                # Trim the title part (on a UTF-8 character boundary) so the
                # complete file name stays within the byte cap.
                suffix = f"_{index}.jpg"
                budget = max_name_bytes - len(suffix.encode("utf-8"))
                stem = sanitized_title.encode("utf-8")[:budget].decode("utf-8", errors="ignore")
                img_path = os.path.join(output_folder, stem + suffix)

                try:
                    # The context manager releases the streamed connection
                    # even when the download fails part-way.
                    with requests.get(img_url, stream=True, headers=headers, timeout=timeout) as img_response:
                        img_response.raise_for_status()
                        with open(img_path, 'wb') as img_file:
                            for chunk in img_response.iter_content(1024):
                                img_file.write(chunk)

                    logging.info(f"Downloaded: {title}")
                except (requests.exceptions.RequestException, OSError) as e:
                    # A network or file-system failure for one image must not
                    # abort the remaining articles.
                    logging.error(f"Failed to download image for {title}: {e}")
            else:
                logging.warning(f"No image found for: {title}")

            # Pause between articles to avoid hammering the server.
            time.sleep(delay)

        return images

    except Exception as e:
        # Page-level failure (fetch, parse, folder creation): report and
        # return an empty result instead of raising.
        logging.error(f"An error occurred: {e}")
        return []

In [16]:
if __name__ == "__main__":
    # Target page and the folder (relative to the working directory) that
    # receives the downloaded images.
    base_url = "https://rajdhanidaily.com/"
    output_folder = "rajdhani_images"

    images_and_titles = scrape_rajdhani_images(
        base_url=base_url,
        output_folder=output_folder,
    )

    # Print a plain-text summary, one line per collected entry.
    print("Scraped images and titles:")
    for item in images_and_titles:
        print(f"Title: {item['title']}, Image URL: {item['image_url']}")

2024-12-30 14:23:05,804 - INFO - Downloaded: भ्रष्टचारबिरूद्ध खबरदारी गर्ने प्रचण्ड ओलीको गृह जिल्लामा
2024-12-30 14:23:07,120 - INFO - Downloaded: मधेशी दलहरुबीच सहकार्यका लागि जसपा नेपालसँग वार्ता
2024-12-30 14:23:08,693 - INFO - Downloaded: नगद सहयोगले बिग्रिए राउटे समुदाय !


Scraped images and titles:
Title: भ्रष्टचारबिरूद्ध खबरदारी गर्ने प्रचण्ड ओलीको गृह जिल्लामा, Image URL: https://rajdhanidaily.com/wp-content/uploads/2024/12/WhatsApp-Image-2024-12-23-at-17.02.13_21fd4df8.jpg-300x192.jpg
Title: मधेशी दलहरुबीच सहकार्यका लागि जसपा नेपालसँग वार्ता, Image URL: https://rajdhanidaily.com/wp-content/uploads/2024/12/09ea674e-7836-4fe9-baee-ed2a68043ede-1024x612.jpeg
Title: नगद सहयोगले बिग्रिए राउटे समुदाय !, Image URL: https://rajdhanidaily.com/wp-content/uploads/2024/12/2-raute-photo.jpg
