<a href="https://colab.research.google.com/github/ShristiJoshi/College-website/blob/main/Newcode_Gorkhapatra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests>=2.31.0 \
    beautifulsoup4>=4.12.0 \
    pandas>=2.0.0 \
    openpyxl>=3.1.0 \
    lxml>=4.9.0 \
    html5lib>=1.1

In [None]:
import logging
import os
import re
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Send INFO-and-above records through the root logger, formatted as
# "timestamp - LEVEL - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the scraper class and main()
logger = logging.getLogger(__name__)


class EnhancedGorkhapatraScraper:
    def __init__(self):
        """Configure the site URL, a browser-like HTTP session and the category map.

        Only configuration happens here: a ``requests.Session`` is created and
        given default headers, but no request is sent.
        """
        self.base_url = "https://gorkhapatraonline.com/"
        # Browser-like headers attached to every request made through the session
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        # Nepali section name -> English slug. The keys must be genuine
        # Devanagari text; mis-decoded (mojibake) keys can never compare equal
        # to text taken from the site.
        self.categories = {
            'राष्ट्रिय': 'national',
            'लोकसेवा': 'public-service',
            'राजनीति': 'politics',
            'अर्थ': 'economy',
            'विचार': 'opinion',
            'खेलकुद': 'sports',
            'मनोरञ्जन': 'entertainment',
            'स्वास्थ्य': 'health',
            'शिक्षा': 'education',
            'प्रविधि': 'technology',
            'अन्तर्राष्ट्रिय': 'international',
        }

    def get_page_content(self, url, retries=3):
        """Download ``url`` through the shared session and return its text.

        The request is attempted up to ``retries`` times with a 20 second
        timeout. After a failed attempt that is not the last one, the method
        sleeps 1, 2, 4, ... seconds before trying again. The response is always
        decoded as UTF-8.

        Returns the page text, or None when every attempt raised a
        ``requests.RequestException`` (or when ``retries`` is not positive).
        """
        for attempt_no in range(1, retries + 1):
            try:
                reply = self.session.get(url, timeout=20)
                reply.raise_for_status()
                reply.encoding = 'utf-8'
                logger.info(f"Successfully fetched: {url}")
                return reply.text
            except requests.RequestException as exc:
                logger.warning(f"Attempt {attempt_no} failed for {url}: {exc}")
                if attempt_no >= retries:
                    # Out of attempts: report and give up
                    logger.error(f"Failed to fetch {url} after {retries} attempts")
                    return None
                # Exponential backoff: 1s after the first failure, then 2s, 4s, ...
                time.sleep(2 ** (attempt_no - 1))
        return None

    def extract_article_links(self, html_content, max_links=10):
        """Collect candidate article URLs from a listing page.

        Anchors are gathered selector by selector, in the priority order
        listed below. Each ``href`` is filtered with ``is_valid_article_url``
        and made absolute with ``build_full_url``. The first occurrence of
        every URL is kept, so the result preserves that priority order and is
        the same on every run for the same HTML.

        Args:
            html_content: HTML source of the page to scan.
            max_links: Maximum number of links to return (default 10, the
                previous hard-coded cap). Pass None for no limit.

        Returns:
            A list of unique absolute URLs, at most ``max_links`` long.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # Tried in order; earlier selectors take priority in the result
        selectors = [
            # Headline links
            'h2 a', 'h3 a', 'h4 a',

            # News section links
            '.news-section a',
            '.article-section a',
            '.main-content a',

            # Links whose path looks like an article
            'a[href*="/news/"]',
            'a[href*="/article/"]',
            'a[href*="/story/"]',

            # Links inside main content areas
            'main a',
            '.content-area a',
            '.news-area a'
        ]

        article_links = []
        seen = set()  # O(1) duplicate check; the list keeps discovery order

        for selector in selectors:
            try:
                for link in soup.select(selector):
                    href = link.get('href')
                    if not href or not self.is_valid_article_url(href):
                        continue
                    full_url = self.build_full_url(href)
                    if full_url and full_url not in seen:
                        seen.add(full_url)
                        article_links.append(full_url)
            except Exception as e:
                # Best effort: a failing selector must not abort the whole scan
                logger.debug(f"Selector {selector} failed: {e}")
                continue

        # Duplicates are already removed above. Slicing the ordered list
        # (instead of going through set(), which scrambles the order) keeps
        # the highest-priority links.
        if max_links is not None:
            article_links = article_links[:max_links]
        logger.info(f"Found {len(article_links)} potential article links")

        return article_links

    def is_valid_article_url(self, url):
        """Check if URL is a valid article URL for Gorkhapatra"""
        exclude_patterns = [
            '/category/', '/tag/', '/author/', '/page/', '/search',
            '/about', '/contact', '/privacy', '/terms', '/advertise',
            '/subscribe', '/login', '/register', '/admin', '/wp-admin',
            '/feed', '.pdf', '.jpg', '.png', '.gif', '.css', '.js',
            '#', 'javascript:', 'mailto:', 'tel:'
        ]

        url_lower = url.lower()
        for pattern in exclude_patterns:
            if pattern in url_lower:
                return False

        # Must be a relative URL or same domain
        if url.startswith('http') and 'gorkhapatraonline.com' not in url:
            return False

        return True

    def build_full_url(self, href):
        """Convert relative URLs to absolute URLs"""
        if href.startswith('http'):
            return href
        elif href.startswith('/'):
            return self.base_url.rstrip('/') + href
        else:
            return self.base_url + href

    def extract_article_data(self, article_url):
        """Fetch one article page and return its extracted fields as a dict.

        The page is downloaded with ``get_page_content`` and parsed with
        BeautifulSoup. Title and body are extracted and validated first; the
        remaining fields are only extracted for articles that pass.

        Returns:
            A dict with the keys Title, Publication_Date, Author, Category,
            Summary, Body_Content, Tags, Images, Article_URL and Scraped_At,
            or None when the page could not be fetched, the title is
            missing, the body is missing or not longer than 50 characters,
            or extraction raised.
        """
        html_content = self.get_page_content(article_url)
        if not html_content:
            return None

        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            title = self.extract_title(soup)
            body_content = self.extract_body_content(soup)

            # The extractors signal "nothing found" with the truthy
            # placeholder "N/A", so a plain truthiness test would never
            # reject a missing title; compare against it explicitly.
            has_title = bool(title) and title != "N/A"
            has_body = (
                bool(body_content)
                and body_content != "N/A"
                and len(body_content) > 50
            )
            if not (has_title and has_body):
                logger.warning(f"Insufficient data extracted from {article_url}")
                return None

            return {
                'Title': title,
                'Publication_Date': self.extract_publication_date(soup),
                'Author': self.extract_author(soup),
                'Category': self.extract_category(soup),
                'Summary': self.extract_summary(soup),
                'Body_Content': body_content,
                'Tags': self.extract_tags(soup),
                'Images': self.extract_images(soup),
                'Article_URL': article_url,
                'Scraped_At': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

        except Exception as e:
            # Best effort per article: log with traceback and let the caller move on
            logger.exception(f"Error extracting data from {article_url}: {e}")
            return None

    def extract_title(self, soup):
        """Extract article title with multiple fallback selectors"""
        title_selectors = [
            'h1.article-title',
            'h1.news-title',
            'h1',
            '.article-title h1',
            '.news-title h1',
            '.headline h1',
            'title'
        ]

        for selector in title_selectors:
            elem = soup.select_one(selector)
            if elem:
                title = elem.get_text(strip=True)
                if title and len(title) > 5:
                    return title
        return "N/A"

    def extract_publication_date(self, soup):
        """Extract publication date with multiple formats"""
        date_selectors = [
            '.published-date',
            '.article-date',
            '.news-date',
            '.meta-date',
            '.timestamp',
            'time',
            '.date'
        ]

        for selector in date_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text(strip=True)
                # Try to extract date from text
                date = self.parse_date_text(text)
                if date:
                    return date
                return text

        return "N/A"

    def parse_date_text(self, text):
        """Parse various date formats"""
        # Nepali date patterns
        nepali_patterns = [
            r'(\d{1,2}\s+[‡§≠‡§¶‡•å|‡§Æ‡§Ç‡§∏‡§ø‡§∞|‡§™‡•Å‡§∑|‡§Æ‡§æ‡§ò|‡§´‡§æ‡§≤‡•ç‡§ó‡•Å‡§®|‡§ö‡•à‡§§|‡§¨‡•à‡§∂‡§æ‡§ñ|‡§ú‡•á‡§†|‡§Ö‡§∏‡§æ‡§∞|‡§∂‡•ç‡§∞‡§æ‡§µ‡§£|‡§≠‡§¶‡•å|‡§Ö‡§∏‡•ã‡§ú|‡§ï‡§æ‡§∞‡•ç‡§§‡§ø‡§ï|‡§Æ‡§Ç‡§∏‡§ø‡§∞]+)\s+(\d{4})',
            r'(\d{1,2}\s+[‡§≠‡§¶‡•å|‡§Æ‡§Ç‡§∏‡§ø‡§∞|‡§™‡•Å‡§∑|‡§Æ‡§æ‡§ò|‡§´‡§æ‡§≤‡•ç‡§ó‡•Å‡§®|‡§ö‡•à‡§§|‡§¨‡•à‡§∂‡§æ‡§ñ|‡§ú‡•á‡§†|‡§Ö‡§∏‡§æ‡§∞|‡§∂‡•ç‡§∞‡§æ‡§µ‡§£|‡§≠‡§¶‡•å|‡§Ö‡§∏‡•ã‡§ú|‡§ï‡§æ‡§∞‡•ç‡§§‡§ø‡§ï|‡§Æ‡§Ç‡§∏‡§ø‡§∞]+)',
        ]

        # English date patterns
        english_patterns = [
            r'(\d{4}[-/]\d{1,2}[-/]\d{1,2})',
            r'(\d{1,2}[-/]\d{1,2}[-/]\d{4})',
            r'(\w+\s+\d{1,2},?\s+\d{4})',
            r'(\d{1,2}\s+\w+\s+\d{4})'
        ]

        for pattern in nepali_patterns + english_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(0)

        return None

    def extract_body_content(self, soup):
        """Extract article body content"""
        content_selectors = [
            '.article-body',
            '.news-body',
            '.story-body',
            '.post-body',
            '.entry-content',
            '.content',
            '.article-content',
            '.story-content',
            'article .content',
            '.main-content'
        ]

        for selector in content_selectors:
            container = soup.select_one(selector)
            if container:
                paragraphs = container.find_all('p')
                if paragraphs:
                    content_list = []
                    for p in paragraphs:
                        text = p.get_text(strip=True)
                        if text and len(text) > 20:
                            content_list.append(text)

                    if content_list:
                        return ' '.join(content_list)

        # Fallback: try to get all paragraphs
        all_paragraphs = soup.find_all('p')
        content_list = []
        for p in all_paragraphs:
            text = p.get_text(strip=True)
            if text and len(text) > 30:
                content_list.append(text)

        return ' '.join(content_list) if content_list else "N/A"

    def extract_category(self, soup):
        """Extract article category"""
        category_selectors = [
            '.category',
            '.article-category',
            '.news-category',
            '.breadcrumb a',
            '.breadcrumbs a',
            '.meta-category',
            '.tag'
        ]

        for selector in category_selectors:
            elem = soup.select_one(selector)
            if elem:
                category = elem.get_text(strip=True)
                if category and len(category) > 2:
                    return category

        return "N/A"

    def extract_author(self, soup):
        """Extract article author"""
        author_selectors = [
            '.author',
            '.article-author',
            '.news-author',
            '.byline',
            '.meta-author',
            '.writer'
        ]

        for selector in author_selectors:
            elem = soup.select_one(selector)
            if elem:
                author = elem.get_text(strip=True)
                if author and len(author) > 2:
                    return author

        return "N/A"

    def extract_summary(self, soup):
        """Extract article summary/excerpt"""
        summary_selectors = [
            '.summary',
            '.excerpt',
            '.article-summary',
            '.news-summary',
            '.description',
            '.meta-description'
        ]

        for selector in summary_selectors:
            elem = soup.select_one(selector)
            if elem:
                summary = elem.get_text(strip=True)
                if summary and len(summary) > 20:
                    return summary

        return "N/A"

    def extract_tags(self, soup):
        """Extract article tags"""
        tag_selectors = [
            '.tags a',
            '.tag a',
            '.article-tags a',
            '.news-tags a',
            '.meta-tags a'
        ]

        tags = []
        for selector in tag_selectors:
            elements = soup.select(selector)
            for elem in elements:
                tag = elem.get_text(strip=True)
                if tag and len(tag) > 2:
                    tags.append(tag)

        return ', '.join(tags) if tags else "N/A"

    def extract_images(self, soup):
        """Extract article images"""
        image_selectors = [
            '.article-image img',
            '.news-image img',
            '.story-image img',
            '.post-image img',
            '.content img',
            'article img'
        ]

        images = []
        for selector in image_selectors:
            elements = soup.select(selector)
            for elem in elements:
                src = elem.get('src')
                if src:
                    if src.startswith('/'):
                        src = self.base_url.rstrip('/') + src
                    elif not src.startswith('http'):
                        src = self.base_url + src
                    images.append(src)

        return ', '.join(images) if images else "N/A"

    def scrape_articles(self, max_articles=20, delay=2):
        """Scrape articles linked from the site's front page.

        The front page is fetched, candidate links are taken from
        ``extract_article_links``, and each link is processed with
        ``extract_article_data`` until ``max_articles`` articles have been
        collected or the links run out. A failure on one article is logged
        and does not stop the run.

        Args:
            max_articles: Maximum number of articles to return.
            delay: Seconds to wait between consecutive article requests
                (default 2, the previous fixed delay).

        Returns:
            A list of article dicts; empty when the front page could not be
            fetched or no links were found.
        """
        logger.info(f"Starting enhanced scraping of {self.base_url}")

        main_page = self.get_page_content(self.base_url)
        if not main_page:
            logger.error("Failed to fetch main page")
            return []

        article_links = self.extract_article_links(main_page)
        if not article_links:
            logger.warning("No article links found")
            return []

        articles_data = []
        failed_scrapes = 0
        total_links = len(article_links)

        logger.info(f"Starting to scrape {total_links} articles...")

        for i, link in enumerate(article_links, 1):
            if len(articles_data) >= max_articles:
                break

            # Polite delay between article requests. Sleeping before every
            # request except the first avoids a wasted pause after the last.
            if i > 1:
                time.sleep(delay)

            logger.info(f"Scraping article {i}/{total_links}: {link}")

            try:
                data = self.extract_article_data(link)
                if data:
                    articles_data.append(data)
                    logger.info(f"✅ Successfully scraped: {data['Title'][:60]}...")
                else:
                    failed_scrapes += 1
                    logger.warning(f"❌ Failed to extract data from: {link}")
            except Exception as e:
                failed_scrapes += 1
                logger.error(f"❌ Error scraping {link}: {e}")

        # Every success appends exactly one item, so the list length is the success count
        logger.info(f"Scraping completed: {len(articles_data)} successful, {failed_scrapes} failed")
        return articles_data

    def save_to_excel(self, articles_data, filename=None):
        """Write the scraped articles to an .xlsx workbook.

        Args:
            articles_data: List of article dicts as produced by
                ``extract_article_data``.
            filename: Target path. When omitted, a name of the form
                ``enhanced_gorkhapatra_news_<YYYYmmdd_HHMMSS>.xlsx`` is used.

        Returns:
            The filename written, or None when there was nothing to save or
            writing failed (the error is logged).
        """
        if not articles_data:
            logger.warning("No data to save")
            return None

        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"enhanced_gorkhapatra_news_{timestamp}.xlsx"

        try:
            df = pd.DataFrame(articles_data)

            # Preferred column order for readability
            column_order = [
                'Title', 'Publication_Date', 'Author', 'Category',
                'Summary', 'Body_Content', 'Tags', 'Images',
                'Article_URL', 'Scraped_At'
            ]

            # Only include columns that exist in the data
            existing_columns = [col for col in column_order if col in df.columns]
            df = df[existing_columns]

            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='News_Data', index=False)

                # Size each column to its longest cell, capped at 100 so
                # long article bodies do not produce unusably wide columns.
                worksheet = writer.sheets['News_Data']
                for column in worksheet.columns:
                    column_letter = column[0].column_letter
                    # Empty cells contribute no width (str(None) would count as 4)
                    max_length = max(
                        (len(str(cell.value)) for cell in column if cell.value is not None),
                        default=0,
                    )
                    worksheet.column_dimensions[column_letter].width = min(max_length + 2, 100)

            logger.info(f"Data saved to {filename}")
            return filename

        except Exception as e:
            logger.error(f"Error saving to Excel: {e}")
            return None

    def get_scraping_stats(self, articles_data):
        """Get statistics about scraped data"""
        if not articles_data:
            return {}

        stats = {
            'total_articles': len(articles_data),
            'categories': {},
            'authors': {},
            'date_range': {'earliest': None, 'latest': None}
        }

        for article in articles_data:
            # Count categories
            category = article.get('Category', 'Unknown')
            stats['categories'][category] = stats['categories'].get(category, 0) + 1

            # Count authors
            author = article.get('Author', 'Unknown')
            stats['authors'][author] = stats['authors'].get(author, 0) + 1

        return stats


def main():
    """Run the scraper interactively and print a summary of the results.

    Prompts for the maximum number of articles (default 20; empty,
    non-numeric or non-positive input, or a closed stdin, falls back to the
    default), scrapes, saves the results to Excel and prints category and
    author distributions plus up to three sample articles.
    """
    try:
        print("🚀 Starting Enhanced Gorkhapatra News Scraper...")
        print("=" * 60)

        scraper = EnhancedGorkhapatraScraper()

        # Get user preference for number of articles
        default_max = 20
        try:
            raw = input("Enter maximum number of articles to scrape (default 20): ").strip()
            max_articles = int(raw) if raw else default_max
            if max_articles < 1:
                # Zero or a negative count would end the scrape immediately
                # and be reported as a failure.
                raise ValueError("article count must be positive")
        except (ValueError, EOFError):
            max_articles = default_max
            print("Invalid input, using default: 20 articles")

        print(f"\n📰 Scraping up to {max_articles} articles from Gorkhapatra Online...")
        print("⏳ This may take a few minutes...\n")

        # Start scraping
        start_time = time.time()
        articles = scraper.scrape_articles(max_articles=max_articles)
        end_time = time.time()

        if articles:
            # Save data
            filename = scraper.save_to_excel(articles)

            # Get statistics
            stats = scraper.get_scraping_stats(articles)

            # Display results
            print("\n" + "=" * 60)
            print("✅ SCRAPING COMPLETED SUCCESSFULLY!")
            print("=" * 60)
            print(f"📊 Total Articles Scraped: {len(articles)}")
            print(f"⏱️  Time Taken: {end_time - start_time:.2f} seconds")
            if filename:
                print(f"📁 Data Saved To: {filename}")
            else:
                # save_to_excel returns None on failure; don't claim a file was written
                print("⚠️ Data could not be saved to Excel (see log for details)")

            print("\n📈 Category Distribution:")
            for category, count in stats['categories'].items():
                print(f"   {category}: {count}")

            print("\n👥 Author Distribution:")
            for author, count in stats['authors'].items():
                if count > 1:  # Only show authors with multiple articles
                    print(f"   {author}: {count}")

            print("\n📰 Sample Articles:")
            for i, article in enumerate(articles[:3], 1):
                print(f"\nArticle {i}:")
                print(f"  📝 Title: {article['Title'][:80]}...")
                print(f"  📅 Date: {article['Publication_Date']}")
                print(f"  🏷️  Category: {article['Category']}")
                print(f"  ✍️  Author: {article['Author']}")
                print(f"  📄 Content: {article['Body_Content'][:100]}...")

        else:
            print("\n❌ No articles were scraped successfully")
            print("💡 Possible issues:")
            print("   - Website structure may have changed")
            print("   - Network connection problems")
            print("   - Website blocking automated requests")

    except KeyboardInterrupt:
        print("\n⚠️ Scraping interrupted by user")
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        print(f"\n❌ An unexpected error occurred: {e}")
        print("💡 Check the logs for more details")


# Start the interactive scraper when this code is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()


🚀 Starting Enhanced Gorkhapatra News Scraper...
Enter maximum number of articles to scrape (default 20): 20

📰 Scraping up to 20 articles from Gorkhapatra Online...
⏳ This may take a few minutes...


✅ SCRAPING COMPLETED SUCCESSFULLY!
📊 Total Articles Scraped: 10
⏱️  Time Taken: 42.79 seconds
📁 Data Saved To: enhanced_gorkhapatra_news_20251204_062842.xlsx

📈 Category Distribution:
   N/A: 10

👥 Author Distribution:
   N/A: 10

📰 Sample Articles:

Article 1:
  📝 Title: विषादी नियन्त्रणको आदेश...
  📅 Date: N/A
  🏷️  Category: N/A
  ✍️  Author: N/A
  📄 Content: तरकारी तथा फलफूलमा हालिएको विष मानिसको शरीरमा मन्द असर गर्छ । यस्तो विषले तत्काल असर नगरे पनि बिस्ता...

Article 2:
  📝 Title: मृत अवस्थामा भेटिए

In [None]:
import glob
import os

from google.colab import files

# The workbook name contains the timestamp of the scraping run, so a
# hard-coded name goes stale (FileNotFoundError). Download the most recently
# modified matching workbook instead.
workbooks = glob.glob("enhanced_gorkhapatra_news_*.xlsx")
if workbooks:
    files.download(max(workbooks, key=os.path.getmtime))
else:
    print("No enhanced_gorkhapatra_news_*.xlsx file found - run the scraper cell first.")

FileNotFoundError: Cannot find file: enhanced_gorkhapatra_news_20250829_170333.xlsx