In [None]:
# Install BeautifulSoup, which provides the `bs4` package imported in the next cell.
!pip install beautifulsoup4



In [None]:
import json
import os
import random
import time
import zipfile
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Class Explanation: `NewsScraper`

## Overview
The `NewsScraper` class is designed for scraping news articles from four different Urdu news websites: Express, Geo, Dunya, and Jang. The class has one method per site, each catering to that site's unique structure and requirements. Below, we go through the class and its methods, detailing what each function does, the input it takes, and the output it returns.

## Class Definition

```python
class NewsScraper:
    def __init__(self, id_=0):
        self.id = id_
```


## Method 1: `get_express_articles`

### Description
Scrapes news articles from the Express website across categories like saqafat (entertainment), business, sports, science-technology, and world. The method navigates through multiple pages for each category to gather a more extensive dataset.

### Input
- **`max_pages`**: The number of pages to scrape for each category (default is 7).

### Process
- Iterates over each category and page.
- Requests each category page and finds article cards within `<ul class='tedit-shortnews listing-page'>`.
- Extracts the article's headline, link, and content by navigating through `<div class='horiz-news3-caption'>` and `<span class='story-text'>`.

### Output
- **Returns**: A single Pandas DataFrame with one row per article and the columns:
  - `id`, `title`, and `link`, identifying the article.
  - `content` (the article text, joined from its paragraphs) and `gold_label` (the normalized category name, e.g. `saqafat` is stored as `entertainment`).

### Data Structure
- Article cards are identified by `<li>` tags.
- Content is structured within `<span class='story-text'>` and `<p>` tags.



In [None]:
class NewsScraper:
    """Scrape Urdu news articles from the Express, Geo, Dunya and Jang sites.

    Every ``get_*_articles`` method returns a ``pandas.DataFrame`` with the
    columns listed in ``COLUMNS``.  Article ids are taken from the running
    counter ``self.id``, which keeps increasing across calls, so one scraper
    instance hands out distinct ids to the articles of every site it scrapes.
    """

    # Column order shared by every returned DataFrame.
    COLUMNS = ("id", "title", "link", "content", "gold_label")
    # Seconds to wait for a server response; without a timeout a stalled
    # connection would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, id_=0):
        """Create a scraper whose first article receives the id ``id_``."""
        self.id = id_

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed; raises on HTTP/network errors."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _cards(soup, container_tag, container_class, card_tag=None, card_class=None):
        """Return the article cards inside the listing container ([] if absent)."""
        container = soup.find(container_tag, class_=container_class)
        if container is None:
            # The original chained ``.find(...).find_all(...)`` crashed here with
            # AttributeError and threw away everything scraped so far.
            print(f"\t--> No <{container_tag} class='{container_class}'> listing found on this page.")
            return []
        if card_class is None:
            return container.find_all(card_tag)
        return container.find_all(card_tag, class_=card_class)

    @staticmethod
    def _anchor_in(card, tag, css_class):
        """Return the first ``<a>`` inside ``<tag class=css_class>`` of ``card``."""
        holder = card.find(tag, class_=css_class)
        anchor = holder.find('a') if holder is not None else None
        if anchor is None:
            raise ValueError(f"card has no link inside <{tag} class='{css_class}'>")
        return anchor

    def _article_text(self, link, tag, css_class):
        """Fetch an article page and join the text of its non-empty ``<p>`` tags."""
        body = self._fetch_soup(link).find(tag, class_=css_class)
        if body is None:
            raise ValueError(f"no <{tag} class='{css_class}'> element at {link}")
        texts = (p.get_text(strip=True) for p in body.find_all('p'))
        # Non-breaking spaces become plain spaces, zero-width spaces are dropped.
        return " ".join(
            text.replace('\xa0', ' ').replace('\u200b', '') for text in texts if text
        )

    def _scrape(self, *, site_root, labels, listing_url, find_cards, parse_card,
                body, max_pages):
        """Run the category -> page -> article loop shared by all sites.

        Args:
            site_root: Scheme and host used to resolve relative article links.
            labels: Mapping of the site's category slug to the stored gold label.
            listing_url: Callable ``(category, page) -> url`` of a listing page.
            find_cards: Callable ``soup -> list`` of article cards on that page.
            parse_card: Callable ``card -> (headline, href)``.
            body: ``(tag, css_class)`` of the element wrapping the article text.
            max_pages: Number of listing pages requested per category.

        Returns:
            A DataFrame with the columns in ``COLUMNS``; empty when nothing
            was scraped.

        Raises:
            requests.RequestException: If a listing page cannot be fetched.
                Failures of individual articles are printed and skipped.
        """
        rows = {column: [] for column in self.COLUMNS}
        body_tag, body_class = body

        for category, gold_label in labels.items():
            # Links already stored for this category.  Listings that ignore the
            # page number return the same cards on every page, which previously
            # produced duplicate rows.
            seen_links = set()

            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                cards = find_cards(self._fetch_soup(listing_url(category, page)))
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                success_count = 0
                duplicate_count = 0

                for card in cards:
                    try:
                        headline, href = parse_card(card)
                        # Absolute hrefs pass through unchanged; relative ones
                        # are resolved against the site root.
                        link = urljoin(site_root, href)
                        if link in seen_links:
                            duplicate_count += 1
                            continue
                        content = self._article_text(link, body_tag, body_class)
                    except Exception as e:
                        # Best effort: one broken card must not stop the run.
                        print(f"\t--> Failed to scrape an article on page {page} of '{category}': {e}")
                        continue

                    seen_links.add(link)
                    rows['id'].append(self.id)
                    rows['title'].append(headline)
                    rows['link'].append(link)
                    rows['content'].append(content)
                    rows['gold_label'].append(gold_label)

                    self.id += 1
                    success_count += 1

                if duplicate_count:
                    print(f"\t--> Skipped {duplicate_count} articles already scraped for '{category}'.")
                print(f"\t--> Successfully scraped {success_count} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(rows)

    # ------------------------------------------------------------------
    # Public, per-site entry points
    # ------------------------------------------------------------------
    def get_express_articles(self, max_pages=7):
        """Scrape express.pk category archives.

        Args:
            max_pages: Archive pages requested per category (default 7).

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: If an archive page cannot be fetched.
        """
        base_url = 'https://www.express.pk'

        def parse_card(card):
            anchor = self._anchor_in(card, 'div', 'horiz-news3-caption')
            return anchor.get_text(strip=True).replace('\xa0', ' '), anchor['href']

        return self._scrape(
            site_root=base_url,
            # "saqafat" is the site's entertainment section.
            labels={
                'saqafat': 'entertainment',
                'business': 'business',
                'sports': 'sports',
                'science': 'science-technology',
                'world': 'world',
            },
            listing_url=lambda category, page: f"{base_url}/{category}/archives?page={page}",
            find_cards=lambda soup: self._cards(
                soup, 'ul', 'tedit-shortnews listing-page', card_tag='li'
            ),
            parse_card=parse_card,
            body=('span', 'story-text'),
            max_pages=max_pages,
        )

    def get_geo_articles(self, max_pages=1):
        """Scrape urdu.geo.tv category pages.

        The listing URL does not contain a page number, so every "page" requests
        the same listing; articles already stored for a category are skipped.

        Args:
            max_pages: Times the category listing is requested (default 1).

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: If a listing page cannot be fetched.
        """
        base_url = 'https://urdu.geo.tv'

        def parse_card(card):
            # The headline is the ``title`` attribute of the boxed anchor; the
            # link is taken from the first anchor of the whole card.
            title_anchor = self._anchor_in(card, 'li', 'border-box')
            return title_anchor['title'], card.find('a')['href']

        return self._scrape(
            site_root=base_url,
            labels={
                'entertainment': 'entertainment',
                'business': 'business',
                'sports': 'sports',
                'health': 'science-technology',
                'world': 'world',
            },
            listing_url=lambda category, page: f"{base_url}/category/{category}",
            find_cards=lambda soup: self._cards(
                soup, 'div', 'video-list laodMoreCatNews',
                card_class='col-xs-6 col-sm-6 col-lg-6 col-md-6 singleBlock',
            ),
            parse_card=parse_card,
            body=('div', 'content-area'),
            max_pages=max_pages,
        )

    def get_dunya_articles(self, max_pages=1):
        """Scrape urdu.dunyanews.tv category pages.

        The listing URL does not contain a page number, so every "page" requests
        the same listing; articles already stored for a category are skipped.

        Args:
            max_pages: Times the category listing is requested (default 1).

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: If a listing page cannot be fetched.
        """
        site_root = 'https://urdu.dunyanews.tv'
        base_url = f'{site_root}/index.php/ur'

        def parse_card(card):
            anchor = self._anchor_in(card, 'div', 'col-md-8')
            return anchor.get_text(strip=True), anchor['href']

        return self._scrape(
            site_root=site_root,
            labels={
                'Entertainment': 'entertainment',
                'Business': 'business',
                'Sports': 'sports',
                'Technology': 'science-technology',
                'World': 'world',
            },
            listing_url=lambda category, page: f"{base_url}/{category}",
            find_cards=lambda soup: self._cards(
                soup, 'div', 'newsBox categories', card_class='cNewsBox'
            ),
            parse_card=parse_card,
            body=('div', 'main-news col-md-12'),
            max_pages=max_pages,
        )

    def get_jang_articles(self, max_pages=1):
        """Scrape jang.com.pk latest-news category pages.

        The listing URL does not contain a page number, so every "page" requests
        the same listing; articles already stored for a category are skipped.

        Args:
            max_pages: Times the category listing is requested (default 1).

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: If a listing page cannot be fetched.
        """
        # No trailing slash: the original produced "https://jang.com.pk//category/...".
        base_url = 'https://jang.com.pk'

        def find_cards(soup):
            items = self._cards(soup, 'ul', 'scrollPaginationNew__', card_tag='li')
            # Advertisement slots share the list with real stories.
            return [
                item for item in items
                if 'ad_latest_stories' not in item.get('class', [])
            ]

        def parse_card(card):
            anchor = self._anchor_in(card, 'div', 'main-heading')
            return anchor.get_text(strip=True).replace('\xa0', ' '), anchor['href']

        return self._scrape(
            site_root=base_url,
            labels={
                'entertainment': 'entertainment',
                'business': 'business',
                'sports': 'sports',
                'health-science': 'science-technology',
                'world': 'world',
            },
            listing_url=lambda category, page: f"{base_url}/category/latest-news/{category}",
            find_cards=find_cards,
            parse_card=parse_card,
            body=('div', 'detail_view_content'),
            max_pages=max_pages,
        )



In [None]:
# One shared scraper instance; its article id counter starts at 0.
scraper = NewsScraper(id_=0)

In [None]:
# Scrape every site with the same `scraper` instance so its running id counter
# is shared by all four DataFrames. (The original referenced an undefined name
# `scrapper`, which raised NameError.)
express_df = scraper.get_express_articles()
geo_df = scraper.get_geo_articles()
jang_df = scraper.get_jang_articles()
dunya_df = scraper.get_dunya_articles()


Scraping page 1 of category 'entertainment'...
	--> Found 4 articles on page 1 of 'entertainment'.
	--> Successfully scraped 4 articles from page 1 of 'entertainment'.
Scraping page 2 of category 'entertainment'...
	--> Found 4 articles on page 2 of 'entertainment'.
	--> Successfully scraped 4 articles from page 2 of 'entertainment'.
Scraping page 3 of category 'entertainment'...
	--> Found 4 articles on page 3 of 'entertainment'.
	--> Successfully scraped 4 articles from page 3 of 'entertainment'.
Scraping page 4 of category 'entertainment'...
	--> Found 4 articles on page 4 of 'entertainment'.
	--> Successfully scraped 4 articles from page 4 of 'entertainment'.
Scraping page 5 of category 'entertainment'...
	--> Found 4 articles on page 5 of 'entertainment'.
	--> Successfully scraped 4 articles from page 5 of 'entertainment'.
Scraping page 6 of category 'entertainment'...
	--> Found 4 articles on page 6 of 'entertainment'.
	--> Successfully scraped 4 articles from page 6 of 'entertai

Unnamed: 0,id,title,link,content,gold_label
0,0,توشہ خانہ ون کیس: نیب کی عمران اور بشریٰ بی بی...,https://urdu.geo.tv/latest/386785-,اسلام آباد: نیب نے توشہ خانہ ون کیس میں عمران ...,entertainment
1,1,موٹروے پولیس نے تقریباً ساڑھے 8 لاکھ مالیت کا ...,https://urdu.geo.tv/latest/386792-,بہاولپور : ایم 5 پر موٹر وے پولیس نے 8 لاکھ 47...,entertainment
2,2,ملتان: ایک مریض سے ایڈز دوسرے مریضوں میں پھیلا...,https://urdu.geo.tv/latest/386788-,ملتان: نشتر اسپتال انتظامیہ نے تصدیق کی ہےکہڈ...,entertainment
3,3,چینی باشندوں کی سکیورٹی کیلئے پرعزم ہیں، افواہ...,https://urdu.geo.tv/latest/386776-,اسلام آباد: ترجمان دفتر خارجہ ممتاز زہرہ بلوچ...,entertainment
4,4,توشہ خانہ ون کیس: نیب کی عمران اور بشریٰ بی بی...,https://urdu.geo.tv/latest/386785-,اسلام آباد: نیب نے توشہ خانہ ون کیس میں عمران ...,entertainment
...,...,...,...,...,...
107,107,چینی باشندوں کی سکیورٹی کیلئے پرعزم ہیں، افواہ...,https://urdu.geo.tv/latest/386776-,اسلام آباد: ترجمان دفتر خارجہ ممتاز زہرہ بلوچ...,world
108,108,توشہ خانہ ون کیس: نیب کی عمران اور بشریٰ بی بی...,https://urdu.geo.tv/latest/386785-,اسلام آباد: نیب نے توشہ خانہ ون کیس میں عمران ...,world
109,109,موٹروے پولیس نے تقریباً ساڑھے 8 لاکھ مالیت کا ...,https://urdu.geo.tv/latest/386792-,بہاولپور : ایم 5 پر موٹر وے پولیس نے 8 لاکھ 47...,world
110,110,ملتان: ایک مریض سے ایڈز دوسرے مریضوں میں پھیلا...,https://urdu.geo.tv/latest/386788-,ملتان: نشتر اسپتال انتظامیہ نے تصدیق کی ہےکہڈ...,world


In [None]:
# Preview the first five rows of each scraped DataFrame, one after another.
print(
    *(table.head(5) for table in (express_df, geo_df, jang_df, dunya_df)),
    sep="\n",
)

Scraping page 1 of category 'saqafat'...
	--> Found 10 articles on page 1 of 'saqafat'.
	--> Successfully scraped 10 articles from page 1 of 'saqafat'.
Scraping page 2 of category 'saqafat'...
	--> Found 10 articles on page 2 of 'saqafat'.
	--> Successfully scraped 10 articles from page 2 of 'saqafat'.
Scraping page 3 of category 'saqafat'...
	--> Found 10 articles on page 3 of 'saqafat'.
	--> Successfully scraped 10 articles from page 3 of 'saqafat'.
Scraping page 4 of category 'saqafat'...
	--> Found 10 articles on page 4 of 'saqafat'.
	--> Successfully scraped 10 articles from page 4 of 'saqafat'.
Scraping page 5 of category 'saqafat'...
	--> Found 10 articles on page 5 of 'saqafat'.
	--> Successfully scraped 10 articles from page 5 of 'saqafat'.
Scraping page 6 of category 'saqafat'...
	--> Found 10 articles on page 6 of 'saqafat'.
	--> Successfully scraped 10 articles from page 6 of 'saqafat'.
Scraping page 7 of category 'saqafat'...
	--> Found 10 articles on page 7 of 'saqafat'.


In [None]:
# Display the Express DataFrame as this cell's output.
express_df

Unnamed: 0,id,title,link,content,gold_label
0,112,بالی ووڈ کی امیر ترین اداکارہ کون ہے؟,https://www.express.pk/story/2733127/who-is-th...,بھارتی فلم انڈسٹری کی مشہور اداکارہ اور متعدد ...,entertainment
1,113,ہیرے کی انگوٹھیاں اور رولیکس پاکستانی مداح کے ...,https://www.express.pk/story/2733122/diamond-r...,پاکستانی مداح نے پنجابی بھارتی گلوکار میکا سنگ...,entertainment
2,114,دو کمروں کے گھر میں رہتے تھے، سنجے کپور کا ماض...,https://www.express.pk/story/2733115/do-kamron...,بھارتی فلم انڈسٹری بالی ووڈ کے اداکار سنجے کپو...,entertainment
3,115,بالی ووڈ کا امیر ترین خاندان جو ماضی میں پھل ف...,https://www.express.pk/story/2733105/phal-faro...,بالی ووڈ کا سب سے امیر ترین خاندان کبھی پھل فر...,entertainment
4,116,امیشا پٹیل نے جیون ساتھی ڈھونڈ لیا؟ نئے تعلق ک...,https://www.express.pk/story/2733100/amisha-pa...,بالی ووڈ کی ’کہو نہ پیار ہے‘ فلم سے مشہور ہونے...,entertainment
...,...,...,...,...,...
345,457,ڈونلڈ ٹرمپ کے مبینہ قتل کا منصوبہ، امریکا میں ...,https://www.express.pk/story/2732232/onl-rmp-k...,مریکی کے محکمہ انصاف نے کہا ہے کہ منتخب صدر ڈو...,world
346,458,نائن الیون حملہ؛ امریکی عدالت نے خالد شیخ کو س...,https://www.express.pk/story/2732192/us-9th11-...,امریکا کی فوجی عدالت نے فیصلہ دیا ہے کہ نائن ا...,world
347,459,فٹبال میچ میں فلسطینی پرچم کی توہین پر تصادم؛ ...,https://www.express.pk/story/2732189/fbal-mich...,نیدر لینڈ میں ہونے والے فٹبال میچ میں اسٹیڈیم ...,world
348,460,مقبوضہ کشمیر میں بھارتی فوج کے میجر نے خودکشی ...,https://www.express.pk/story/2732183/indian-ar...,مقبوضہ کشمیر میں ذہنی دباؤ اور پست ہمتی کے با...,world
