In [17]:
from bs4 import BeautifulSoup 
from datetime import datetime
#import logging
#from .utils import request_page_using_webdriver, Helper
#from urllib.request import urlopen
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import pandas as pd

In [28]:
class AlJazeeraScraper:
    base_url = "https://www.aljazeera.com/"

    def __init__(self, url=None):
        self._url = url
        self._data = ''
        self._soup = None
        # self.logger = logging.getLogger(__name__)
        self.driver = self.init_web_driver()

    def init_web_driver(self):
        path_to_chromedriver = '/Users/peizhi/chromedriver-mac-x64/chromedriver'
        options = Options()
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")

        service = Service(executable_path=path_to_chromedriver)
        driver = webdriver.Chrome(service=service)
        
        return driver
    
    def requestPageUsingWebDriver(self, link):
        self.driver.get(link)
        self.driver.implicitly_wait(50)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        return soup

    def fetch_page(self, url):
        try:
            self.driver.get(url)
            self.driver.implicitly_wait(10) 
            return self.driver.page_source
        except Exception as e:
            # self.logger.error(f"Error fetching page: {e}")
            print(f"Error fetching page: {e}")
            return None

    def extract_category_urls(self):
        page_content = self.fetch_page(self.base_url)
        if page_content:
            soup = BeautifulSoup(page_content, "html.parser")
            nav_tag = soup.find('nav', class_='site-header__navigation css-15ru6p1')
            if nav_tag:
                li_tags = nav_tag.find_all('li', class_='menu__item menu__item--aje')
                categories = {}
                for li in li_tags:
                    for a in li.find_all('a', href=True):
                        text = a.text.strip()
                        href = a['href'].strip()
                        if href.startswith('http'):
                            categories[text] = href
                        else:
                            categories[text] = f"{self.base_url.rstrip('/')}/{href.lstrip('/')}"
                return categories
        else:
            return {}

    def extract_latest_articles(self, category_url, max_articles=20):
        page_content = self.fetch_page(category_url)
        if page_content:
            soup = BeautifulSoup(page_content, "html.parser")
            result_links = []
            for a_tag in soup.find_all("a", class_='u-clickable-card__link'):
                if a_tag and "page" not in a_tag["href"] and "https" not in a_tag["href"]:
                    result_links.append(f'{self.base_url.rstrip("/")}/{a_tag["href"].lstrip("/")}')
                    if len(result_links) >= max_articles:
                        break

            return result_links
        else:
            return []
    

    def scrapeMultipleArticles(self, links):
        articles = []

        for link in links:
            article = self.scrapeArticle(link)
            if article:
                articles.append(article)

        return articles

    def scrapeArticle(self, link):
        try:
            self.driver.get(link)
            self.driver.implicitly_wait(10)
            parsed_page = BeautifulSoup(self.driver.page_source, "html.parser")

            article_data = {
                "title": "",
                "author": "Al Jazeera",
                "date": "",
                "image_url": "",
                "content": ""
            }

            try:
                header = parsed_page.find('header', class_='article-header')
                article_data["title"] = header.find('h1').text if header and header.find('h1') else "No Title"

                author_info = parsed_page.find('div', class_='article-author-name')
                article_data["author"] = author_info.find('a', class_='author-link').text if author_info and author_info.find('a', class_='author-link') else "Unknown"

                date_info = parsed_page.find('div', class_='article-dates')
                article_data["date"] = date_info.find('span').text.strip() if date_info and date_info.find('span') else "Unknown"

                figure = parsed_page.find('figure', class_='article-featured-image')
                article_data["image_url"] = figure.find('img')['src'] if figure and figure.find('img') else ""

                content_area = parsed_page.find('div', class_='wysiwyg')
                paragraphs = [para.text for para in content_area.find_all('p')] if content_area else []
                article_data["content"] = "\n".join(paragraphs)

            except Exception as e:
                print(f"Error parsing article: {e}")
                return None

        except Exception as e:
            print(f"Error scraping article: {e}")
            return None

        return article_data

    def close_driver(self):
        if self.driver:
            self.driver.quit()

In [29]:
# Categories of Aljazeera News 
scraper = AlJazeeraScraper()
categories = scraper.extract_category_urls() 
print(categories)

{'Middle East': 'https://www.aljazeera.com/middle-east/', 'Africa': 'https://www.aljazeera.com/africa/', 'Asia': 'https://www.aljazeera.com/asia/', 'US & Canada': 'https://www.aljazeera.com/us-canada/', 'Latin America': 'https://www.aljazeera.com/latin-america/', 'Europe': 'https://www.aljazeera.com/europe/', 'Asia Pacific': 'https://www.aljazeera.com/asia-pacific/', 'Israel-Gaza war': 'https://www.aljazeera.com/tag/israel-palestine-conflict/', 'Features': 'https://www.aljazeera.com/features/', 'Economy': 'https://www.aljazeera.com/economy/', 'Opinion': 'https://www.aljazeera.com/opinion/', 'Video': 'https://www.aljazeera.com/videos/', 'Ukraine war': 'https://www.aljazeera.com/tag/ukraine-russia-crisis/', 'Coronavirus': 'https://www.aljazeera.com/tag/coronavirus-pandemic/', 'Climate Crisis': 'https://www.aljazeera.com/climate-crisis', 'Investigations': 'https://www.aljazeera.com/investigations/', 'Interactives': 'https://www.aljazeera.com/interactives/', 'In Pictures': 'https://www.alj

In [30]:
# Create an instance of AlJazeeraScraper
scraper = AlJazeeraScraper()

try:
    # Extract category URLs
    categories = scraper.extract_category_urls()

    # Choose a category (for example, 'Economy')
    economy_category_url = categories.get('Economy')
    if economy_category_url:
        # Retrieve the latest articles from the Economy category
        latest_economy_articles = scraper.extract_latest_articles(economy_category_url)
        if latest_economy_articles:
            print("Latest Economy Articles:")
            for article in latest_economy_articles:
                print(article)
        else:
            print("No articles found in the Economy category.")
    else:
        print("Economy category URL not found")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the WebDriver
    scraper.close_driver()

Latest Economy Articles:
https://www.aljazeera.com/news/2023/12/15/boycotts-and-protests-how-are-people-around-the-world-defying-israel
https://www.aljazeera.com/economy/2023/12/15/japan-expands-sanctions-over-russias-war-in-ukraine
https://www.aljazeera.com/economy/2023/12/15/us-highlights-ai-as-risk-to-financial-system-for-first-time
https://www.aljazeera.com/features/2023/12/14/conned-exploited-trapped-romanias-new-flock-of-asian-delivery-riders
https://www.aljazeera.com/news/2023/12/13/tesla-recalls-nearly-all-us-vehicles-over-autopilot-system-defects
https://www.aljazeera.com/economy/2023/12/13/can-indigenous-inclusivity-be-the-key-to-successful-carbon-markets
https://www.aljazeera.com/news/2023/12/12/us-announces-hundreds-of-new-sanctions-in-push-to-isolate-russia
https://www.aljazeera.com/features/2023/12/12/how-a-childs-death-caused-an-ohio-city-to-turn-on-its-haitian-community
https://www.aljazeera.com/news/2023/12/12/tanker-attacked-by-cruise-missile-as-it-traverses-bab-el-ma

In [31]:
# Israel-Gaza war articles 
scraper = AlJazeeraScraper()
category_urls = scraper.extract_category_urls()

israel_gaza_category_url = category_urls.get("Israel-Gaza war", None)

if not israel_gaza_category_url:
    print("Israel-Gaza war category not found.")
else:
    print(f"Category URL for Israel-Gaza war: {israel_gaza_category_url}")

article_links = scraper.extract_latest_articles(israel_gaza_category_url)
for link in article_links:
    print(link)

Category URL for Israel-Gaza war: https://www.aljazeera.com/tag/israel-palestine-conflict/
https://www.aljazeera.com/news/2023/12/16/israeli-hostages-mistakenly-killed-in-gaza-were-had-a-white-flag-official
https://www.aljazeera.com/gallery/2023/12/16/photos-al-jazeera-journalist-samer-abudaqa-laid-to-rest
https://www.aljazeera.com/program/the-bottom-line/2023/12/16/john-mearsheimer-israel-is-choosing-apartheid-or-ethnic-cleansing
https://www.aljazeera.com/news/2023/12/16/stand-with-palestine-football-match-charity-no-place-for-genocide-israel-war-gaza-palestinians-children
https://www.aljazeera.com/news/2023/12/16/stand-with-palestine-football-match-charity-no-place-for-genocide-israel-war-gaza-palestinians-children
https://www.aljazeera.com/news/2023/12/16/israel-keeps-the-pressure-on-gaza-as-qatar-confirms-truce-talks
https://www.aljazeera.com/news/2023/12/16/al-jazeera-journalist-samer-abudaqa-laid-to-rest-in-southern-gaza
https://www.aljazeera.com/program/newsfeed/2023/12/16/momen

In [32]:
article_links = scraper.extract_latest_articles(israel_gaza_category_url)
articles_data = scraper.scrapeMultipleArticles(article_links)
df = pd.DataFrame(articles_data)

print(df)

                                                title                author  \
0   Israeli captives mistakenly killed in Gaza hel...               Unknown   
1                                            No Title               Unknown   
2   John Mearsheimer: Israel is choosing ‘aparthei...               Unknown   
3   ‘No place for genocide’: Qatar football fans s...            Hafsa Adil   
4   ‘No place for genocide’: Qatar football fans s...            Hafsa Adil   
5   Israel keeps the pressure on Gaza as Qatar con...               Unknown   
6   Al Jazeera journalist Samer Abudaqa laid to re...               Unknown   
7             Moment Israeli police attack journalist               Unknown   
8   Iran-aligned Houthis warn Israel, US against a...               Unknown   
9     Trauma and terror: What we are seeing from Gaza               Unknown   
10  Three brothers in the West Bank, kept apart by...          Ayman Nobani   
11       Israel-Hamas war: List of key events, day 7