In [2]:
import os
import unicodedata
from urllib.parse import urljoin

import pandas as pd
import urllib3
from bs4 import BeautifulSoup as BS

# Silence urllib3's InsecureRequestWarning messages
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Shared connection pool; the browser-like User-Agent below is used as the
# default header set for every request made through it
http = urllib3.PoolManager(
    headers={
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        ),
    },
)

# Column-oriented accumulator for the scraped news data: each key is an
# output column and starts with its own empty list
ndict = {
    column: []
    for column in (
        'Title', "URL", "Date",
        "Author", "Author URL", "Content",
        "Category", "Description",
    )
}

# Category name -> listing page URL for that category
categories = dict(
    business="https://ekantipur.com/business/",
    world="https://ekantipur.com/world/",
    sports="https://ekantipur.com/sports/",
)

# Set to True to print every scraped entry as it is collected
show = False

# Scrape each category listing page, follow every article link found in a
# ".normal" element, and append one row per article to ndict.

# Seconds to wait on a single request; without a timeout one stalled
# connection would hang the whole run.
REQUEST_TIMEOUT = 15.0


def _fetch_soup(page_url):
    """Fetch page_url and return it parsed with html.parser.

    Returns None (after printing the reason) when the request raises a
    urllib3 error or the response status is not 200, so one bad page does
    not abort the run and discard everything collected so far.
    """
    try:
        response = http.request('GET', page_url, timeout=REQUEST_TIMEOUT)
    except urllib3.exceptions.HTTPError as err:
        print(f"Request failed for {page_url}: {err}")
        return None
    if response.status != 200:
        print(f"Unexpected HTTP status {response.status} for {page_url}")
        return None
    return BS(response.data, 'html.parser')


def _extract_content(article_soup):
    """Return the article text joined from the <p> tags of the first ".row".

    Returns "Content not found" when the page has no ".row" element.
    """
    container = article_soup.select_one(".row")
    if not container:
        return "Content not found"

    paragraphs = []
    for paragraph in container.find_all("p"):
        # get_text() keeps text inside and after nested tags (<strong>, <a>,
        # ...); splitting the tag's string form on ">" only ever saw the
        # text before the first nested tag.
        text = paragraph.get_text().strip()
        if not text:
            # Preserved from the original logic: stop collecting at the
            # first paragraph that has no text at all.
            break
        paragraphs.append(text)
    return " ".join(paragraphs)


for category, url in categories.items():
    soup = _fetch_soup(url)
    if soup is None:
        print(f"Skipping category {category}: listing page unavailable")
        continue

    # Each ".normal" element is treated as one article teaser
    for row in soup.select(".normal"):
        # Title is in an h2 element wrapping the article link
        title = row.find("h2")
        href = title.a.get("href") if title and title.a else None
        if not href:
            print(f"Skipping an entry in {category} due to missing title or link")
            continue

        title_text = title.text.strip()
        # urljoin leaves absolute links untouched and resolves relative ones
        # against the listing URL
        title_link = urljoin(url, href)

        # Description is in a p element
        description = row.find("p")
        description_text = description.text.strip() if description else "No description available"

        # Request the individual news page
        news_soup = _fetch_soup(title_link)
        if news_soup is None:
            print(f"Skipping article in {category}: {title_link}")
            continue

        # Find the date (with fallback)
        date_elem = news_soup.find("time")
        date = date_elem.text.strip() if date_elem else "Date not found"

        # Find the author URL and name (with fallback)
        author_elem = news_soup.select_one(".author")
        if author_elem and author_elem.a:
            author_url = author_elem.a.get("href")
            author_name = author_elem.text.strip()
        else:
            author_url = "Author URL not found"
            author_name = "Unknown Author"

        content = _extract_content(news_soup)

        # Append one value per column so all ndict lists stay aligned
        record = {
            "Title": title_text,
            "URL": title_link,
            "Date": date,
            "Author": author_name,
            "Author URL": author_url,
            "Content": content,
            "Category": category,
            "Description": description_text,
        }
        for column, value in record.items():
            ndict[column].append(value)

        # Optional: Print the entry
        if show:
            print(f"""
                    Title: {title_text}, URL: {title_link}
                    Date: {date}, Author: {author_name}, Category: {category},
                    Author URL: {author_url},
                    Description: {description_text},
                    Content: {content}
                """)

# Build the DataFrame from the collected columns, keeping ndict's key order
column_order = list(ndict)
ekantipur_df = pd.DataFrame(ndict, columns=column_order)

# Preview the first rows (optional)
preview = ekantipur_df.head()
print(preview)

# Save each article to a unique .txt file in category-wise folders
base_dir = "ekantipur_news"  # Base directory to store category folders
# exist_ok avoids the check-then-create race of exists() + makedirs()
os.makedirs(base_dir, exist_ok=True)


def _safe_title(title, max_len=50):
    """Return title reduced to filename-safe characters, cut to max_len.

    Alphanumerics, space, "_" and "-" are kept. Unicode combining marks
    (general category M*) are kept as well: str.isalnum() is False for
    Devanagari vowel signs and the virama, so without this a Nepali title
    would have most of its characters replaced. Anything else becomes "_".
    """
    cleaned = "".join(
        ch
        if ch.isalnum() or ch in " _-" or unicodedata.category(ch).startswith("M")
        else "_"
        for ch in title
    )
    return cleaned[:max_len]


# Loop through each category
for category in categories:
    # Create category folder if it doesn't exist
    category_dir = os.path.join(base_dir, category)
    os.makedirs(category_dir, exist_ok=True)

    # Filter DataFrame for this category
    category_df = ekantipur_df[ekantipur_df["Category"] == category]

    # Save each article in a separate file
    for index, row in category_df.iterrows():
        # The DataFrame index is appended so that two articles whose
        # sanitized titles collide still get distinct files
        file_name = f"{_safe_title(row['Title'])}_{index}.txt"
        file_path = os.path.join(category_dir, file_name)

        # Write to text file, one "Field: value" line per column
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines([
                f"Title: {row['Title']}\n",
                f"URL: {row['URL']}\n",
                f"Date: {row['Date']}\n",
                f"Author: {row['Author']}\n",
                f"Author URL: {row['Author URL']}\n",
                f"Category: {row['Category']}\n",
                f"Description: {row['Description']}\n",
                f"Content: {row['Content']}\n",
            ])

        print(f"Saved article to {file_path}")

                                               Title  \
0  लुम्बिनी र पशुपति विकास कोषलाई गौरवको आयोजनाबा...   
1  साना र मझौला आयोजना संघीय सरकारको बजेटमा नराख्...   
2  उच्चस्तरीय आर्थिक सुधार सुझाव आयोगको प्रतिवेदन...   
3    अर्थतन्त्र सुधारका लागि प्रतिवेदन बुझाउँदै आयोग   
4  ‘आर्थिक विकासका लागि सरकार र निजी क्षेत्र सहका...   

                                                 URL            Date  \
0  https://ekantipur.com/business/2025/04/11/sugg...  Date not found   
1  https://ekantipur.com/business/2025/04/11/smal...  Date not found   
2  https://ekantipur.com/business/2025/04/11/the-...  Date not found   
3  https://ekantipur.com/business/2025/04/11/comm...  Date not found   
4  https://ekantipur.com/business/2025/04/11/gove...  Date not found   

                Author                                 Author URL  \
0  कान्तिपुर संवाददाता  https://ekantipur.com/author/author-14301   
1  कान्तिपुर संवाददाता  https://ekantipur.com/author/author-14301   
2  कान्तिपुर संवाददाता 