In [4]:
import requests
from bs4 import BeautifulSoup

# Function to scrape links, titles, and descriptions from a webpage
def scrape_webpage(url, timeout=10):
    """Download a page and collect its absolute links, link texts and trailing text.

    Only anchors whose ``href`` starts with ``http`` are kept; relative links,
    fragments and anchors without an ``href`` are skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get`` so a stalled connection cannot hang
            the notebook forever.

    Returns:
        A tuple ``(links, titles, descriptions)`` of three lists of equal
        length, aligned by index. ``titles`` holds the stripped text of each
        anchor; ``descriptions`` holds the stripped text node that follows the
        anchor as a sibling, or ``""`` when there is none.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of scraping an error page as if it were content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The three lists are appended to in lockstep so they stay index-aligned.
    extracted_links = []
    extracted_titles = []
    extracted_descriptions = []

    for anchor_tag in soup.find_all('a'):
        href = anchor_tag.get('href')
        if not href or not href.startswith('http'):
            continue

        extracted_links.append(href)
        extracted_titles.append(anchor_tag.get_text(strip=True))

        # The "description" is the first text node among the anchor's following siblings.
        description = anchor_tag.find_next_sibling(string=True)
        extracted_descriptions.append(description.strip() if description else "")

    return extracted_links, extracted_titles, extracted_descriptions

# Example usage: scrape the two news front pages (each call performs a live HTTP request)
dawn_links, dawn_titles, dawn_descriptions = scrape_webpage('https://www.dawn.com/')
bbc_links, bbc_titles, bbc_descriptions = scrape_webpage('https://www.bbc.com/')

# Print the extracted data, one labelled section per list
print("Dawn Links:")
print(dawn_links)
print("\nDawn Titles:")
print(dawn_titles)
print("\nDawn Descriptions:")
print(dawn_descriptions)

# This heading was missing, which left the BBC links printed without a label
print("\nBBC Links:")
print(bbc_links)
print("\nBBC Titles:")
print(bbc_titles)
print("\nBBC Descriptions:")
print(bbc_descriptions)


Dawn Links:
['https://epaper.dawn.com', 'https://www.dawnnews.tv/watch-live', 'https://www.dawnnews.tv', 'https://images.dawn.com', 'https://herald.dawn.com', 'https://aurora.dawn.com', 'https://cityfm89.com', 'https://www.dawn.com/advertise', 'https://educationexpo.dawn.com', 'https://www.dawn.com/events/supplements', 'http://classifieds.dawn.com/', 'https://obituary.dawn.com', 'https://www.dawn.com/', 'https://epaper.dawn.com', 'https://www.dawn.com/trends/gaza-invasion', 'https://www.dawnrelief.com/', 'https://www.dawn.com/trends/gaza-invasion', 'https://www.dawnrelief.com/', 'https://www.dawn.com/live/gaza-invasion', 'https://www.dawn.com/live/gaza-invasion', 'https://www.dawn.com/news/1832948/uae-hits-out-at-netanyahu-for-saying-it-may-help-run-gaza', 'https://www.dawn.com/news/1833008/adoption-of-palestine-resolution-to-put-pressure-on-us-munir-akram', 'https://www.dawn.com/news/1833001/washington-says-use-of-its-weapons-by-tel-aviv-in-gaza-violated-law', 'https://www.dawn.com/ne

In [7]:
import re

# Function to clean and format text data
def clean_text(text):
    """Normalise a string to lowercase ASCII words separated by single spaces.

    Every character that is not an ASCII letter or whitespace is dropped, so
    digits, punctuation and non-ASCII letters disappear. The remainder is
    lowercased, and runs of whitespace (including leading and trailing
    whitespace) collapse to single spaces.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; empty if no ASCII letters remain.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    # split() with no argument discards all surrounding and repeated whitespace
    words = letters_and_spaces.lower().split()
    return ' '.join(words)

# Run every scraped title and description through clean_text
cleaned_dawn_titles = list(map(clean_text, dawn_titles))
cleaned_dawn_descriptions = list(map(clean_text, dawn_descriptions))

cleaned_bbc_titles = list(map(clean_text, bbc_titles))
cleaned_bbc_descriptions = list(map(clean_text, bbc_descriptions))

# Show each cleaned list on the line below its heading
print("Cleaned Dawn Titles:", cleaned_dawn_titles, sep="\n")
print("\nCleaned Dawn Descriptions:", cleaned_dawn_descriptions, sep="\n")

print("\nCleaned BBC Titles:", cleaned_bbc_titles, sep="\n")
print("\nCleaned BBC Descriptions:", cleaned_bbc_descriptions, sep="\n")


Cleaned Dawn Titles:
['epaper', 'live tv', 'dawnnews urdu', 'images', 'herald', 'aurora', 'cityfm', 'advertise', 'events', 'supplements', 'classifieds', 'obituaries', 'dawncom', 'todays paper may', 'gaza siege', 'flood donations', 'gaza siege', 'flood donations', 'donors pledge over bn for gaza at kuwait conference', '', 'uae hits out at netanyahu for saying it may help run gaza', 'adoption of palestine resolution to put pressure on us munir akram', 'washington says use of its weapons by tel aviv in gaza violated law', 'situation in ajk calms down after days of violent clashes', '', 'heros welcome for hockey team in lahore after winning silver medal in azlan shah cup', '', 'after months of speculation miftah ismail confirms hes forming new political party', '', 'imf mission to meet pakistani authorities next week to discuss next phase of engagement official', '', 'no such thing as strategic stateowned enterprises says finance minister aurangzeb', '', 'fight till the last ball pcb chief

In [10]:
import csv

# Tag each (title, description) pair with the name of the site it came from
dawn_data_with_website = [
    ('Dawn', *pair)
    for pair in zip(cleaned_dawn_titles, cleaned_dawn_descriptions)
]
bbc_data_with_website = [
    ('BBC', *pair)
    for pair in zip(cleaned_bbc_titles, cleaned_bbc_descriptions)
]

# Single list of rows: all Dawn rows first, then all BBC rows
combined_data = [*dawn_data_with_website, *bbc_data_with_website]

# Function to save data to a CSV file
def save_to_csv(filename, data):
    """Write rows to a UTF-8 CSV file under a Website/Title/Description header.

    An existing file at ``filename`` is overwritten.

    Args:
        filename: Path of the CSV file to create.
        data: Iterable of rows; each row is an iterable of field values.
    """
    # newline='' lets the csv module control line endings itself
    with open(filename, mode='w', encoding='utf-8', newline='') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(('Website', 'Title', 'Description'))
        for record in data:
            csv_writer.writerow(record)

# Write the rows from both sites to one CSV file in the working directory
save_to_csv(filename='combined_data.csv', data=combined_data)

# Only reached when the write above did not raise
print("Combined data saved to CSV file successfully.")


Combined data saved to CSV file successfully.
