In [26]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

# Start a Chrome session. No chromedriver path is passed here, so Selenium
# resolves the driver binary on its own.
driver = webdriver.Chrome()

# Article links look like https://www.moroccoworldnews.com/morocco-news/<1-4 digits>
url_pattern = re.compile(r'https://www\.moroccoworldnews\.com/morocco-news/\d{1,4}$')

try:
    # Fetch the listing page inside the try block so a failed navigation is
    # reported and the browser is still closed afterwards.
    driver.get("https://www.moroccoworldnews.com/morocco-news")

    # Wait (up to 10 s) until <a> tags are present, then collect all of them
    a_tags = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
    )

    # Read each href exactly once: every get_attribute() call is a round-trip
    # to the browser, and the element may go stale between repeated calls.
    hrefs = [a_tag.get_attribute('href') for a_tag in a_tags]
    matching = [href for href in hrefs if href and url_pattern.match(href)]

    # The same article is often linked several times on one page; keep the
    # first occurrence of each URL, preserving page order.
    urls = list(dict.fromkeys(matching))

    # Save URLs to a text file, one per line
    with open('extracted_urls.txt', 'w') as file:
        file.writelines(url + '\n' for url in urls)

    print(f"Saved {len(urls)} URLs to 'extracted_urls.txt'")

except Exception as e:
    print(f"Error: {e}")

finally:
    # Close the browser on every exit path, including KeyboardInterrupt
    driver.quit()


Saved 2608 URLs to 'extracted_urls.txt'


In [41]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Start a Chrome session; `driver` is the module-level browser that
# extract_data() navigates with.
driver = webdriver.Chrome()

# Open the Hespress search results for "events in morocco".
# NOTE(review): nothing in this cell reads from this page before extract_data()
# navigates to the first article URL — confirm whether this load is still needed.
driver.get("https://en.hespress.com/?s=events+in+morocco")

# Read URLs from the text file
def read_urls(file_path):
    """Read a list of URLs from a text file, one URL per line.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping (for example a trailing blank line) are skipped so that
    callers never receive an empty string as a URL.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of non-empty URL strings in file order.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [url for url in stripped_lines if url]

# Extract data from each URL
def extract_data(urls, max_urls=500, delay=2):
    """Scrape title, category and date from a list of article pages.

    Pages are opened one after another with the module-level Selenium
    ``driver``. A page that fails to load, or whose title does not appear
    within 10 seconds, is reported and skipped; it never aborts the run.

    Args:
        urls: Sequence of article URLs.
        max_urls: Only the first ``max_urls`` entries of ``urls`` are visited.
        delay: Seconds to pause after every page (successful or not) to avoid
            overwhelming the server.

    Returns:
        List of ``(url, title, category, date)`` tuples, one per page that was
        scraped successfully. ``category`` and ``date`` are ``"N/A"`` when the
        corresponding element is not on the page.
    """
    results = []
    for url in urls[:max_urls]:
        print(f"Processing URL: {url}")  # Print the URL being processed
        try:
            # Navigation is inside the try block: a malformed or unreachable
            # URL must only cost this one row, not every result gathered so far.
            driver.get(url)

            # Extract post title (also serves as the "page is loaded" signal)
            title_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h1.post-title"))
            )
            title = title_element.text

            # Extract category: the last breadcrumb link, if any
            category_elements = driver.find_elements(By.CSS_SELECTOR, "li.breadcrumb-item a")
            category = category_elements[-1].text if category_elements else "N/A"

            # Extract date. find_elements() returns an empty list when nothing
            # matches, whereas find_element() raises, which would skip the
            # "N/A" fallback and drop the whole row.
            date_elements = driver.find_elements(By.CSS_SELECTOR, "span.date-post")
            date = date_elements[0].text if date_elements else "N/A"

            results.append((url, title, category, date))

        except Exception as e:
            print(f"Error extracting data from {url}: {e}")

        finally:
            # Wait to avoid overwhelming the server, after failures as well
            time.sleep(delay)

    return results

# Save results to a CSV file
def save_results(results, file_path):
    """Write scraped rows to a CSV file and hand back the table.

    Args:
        results: Iterable of 4-item rows in the order URL, title, category, date.
        file_path: Destination of the CSV file.

    Returns:
        The pandas DataFrame that was written, with columns
        ``URL``, ``Title``, ``Category`` and ``Date``.
    """
    column_names = ['URL', 'Title', 'Category', 'Date']
    frame = pd.DataFrame(data=results, columns=column_names)
    # index=False keeps the positional row index out of the file
    frame.to_csv(file_path, index=False)
    return frame

# Main execution: read the URL list, scrape each page, save and show the table
try:
    urls = read_urls('extracted_urls.txt')
    results = extract_data(urls)
    df = save_results(results, 'extracted_data.csv')

    # Print the DataFrame
    print(df)

except Exception as e:
    print(f"Error: {e}")

finally:
    # Close the browser on every exit path. The scrape runs for a long time,
    # and interrupting it (KeyboardInterrupt is not an Exception) would
    # otherwise leave the Chrome process running.
    driver.quit()


Processing URL: https://en.hespress.com/25337-morocco-to-participate-in-the-junior-track-cycling-world-championships.html
Processing URL: https://en.hespress.com/51635-visa-for-music-reveals-long-awaited-artist-lineup.html
Processing URL: https://en.hespress.com/45858-migration-moroccos-policy-highlighted-in-montreal.html
Processing URL: https://en.hespress.com/58176-afcon-2025-ideal-arch-favorite-candidate-african-media-admires-moroccos-infrastructural-cultural-potential.html
Processing URL: https://en.hespress.com/67200-govrin-back-as-head-of-israeli-liaison-office-in-morocco.html
Processing URL: https://en.hespress.com/39134-african-development-bank-grants-morocco-over-usd-1-billion-for-economic-relaunch.html
Processing URL: https://en.hespress.com/83759-weather-alert-morocco-forecasts-thunderstorms-gusty-wings-on-saturday-in-several-provinces.html
Processing URL: https://en.hespress.com/35373-morocco-in-can-vahid-halilhodzics-wrong-moves-put-him-under-fire.html
Processing URL: http

In [1]:
import pandas as pd

# Load the article table from extracted_data.csv, the file the scraping step writes
df = pd.read_csv('extracted_data.csv')

In [2]:
# List the column labels of the loaded table
df.columns

Index(['URL', 'Title', 'Category', 'Date'], dtype='object')

In [3]:
# Boolean mask: True where the Date text contains "2024"; rows with a missing date count as no match
mentions_2024 = df['Date'].str.contains('2024', na=False)
df_filtered = df[mentions_2024]


In [4]:
# Inspect the rows kept by the 2024 filter (they still carry their original row labels)
df_filtered

Unnamed: 0,URL,Title,Category,Date
6,https://en.hespress.com/83759-weather-alert-mo...,Weather Alert: Morocco Forecasts Thunderstorms...,HESPRESS English – Morocco News,Friday 19 April 2024 - 13:50
9,https://en.hespress.com/79829-moroccan-mps-rej...,Moroccan MPs reject Algerian amendments at Uni...,Politics,Thursday 15 February 2024 - 16:03
23,https://en.hespress.com/87386-moroccos-hilale-...,"Morocco's Hilale Launches Global Campaign ""fro...",HESPRESS English – Morocco News,Wednesday 3 July 2024 - 09:42
25,https://en.hespress.com/83709-portuguese-compa...,Portuguese companies eye Moroccan market ahead...,HESPRESS English – Morocco News,Thursday 18 April 2024 - 20:01
32,https://en.hespress.com/82804-ministry-denies-...,Ministry Denies Subsidies to Ryanair for Domes...,HESPRESS English – Morocco News,Saturday 6 April 2024 - 09:21
...,...,...,...,...
477,https://en.hespress.com/78369-moroccan-agricul...,Moroccan Agriculture Minister participates in ...,General,Friday 19 January 2024 - 18:49
485,https://en.hespress.com/85604-morocco-turns-to...,Morocco turns to themed experiences to attract...,Economy,Thursday 23 May 2024 - 17:02
486,https://en.hespress.com/84386-labor-day-labor-...,"Labor Day: Labor Unions voice social demands, ...",HESPRESS English – Morocco News,Thursday 2 May 2024 - 10:00
487,https://en.hespress.com/78202-senegalese-moroc...,Senegalese-Moroccan decentralization conferenc...,HESPRESS English – Morocco News,Tuesday 16 January 2024 - 23:36


In [5]:
# Renumber the remaining rows from 0; drop=True discards the old row labels instead of keeping them as a column
df_filtered = df_filtered.reset_index(drop=True)  

In [6]:
# Confirm the row labels now run consecutively from 0
df_filtered

Unnamed: 0,URL,Title,Category,Date
0,https://en.hespress.com/83759-weather-alert-mo...,Weather Alert: Morocco Forecasts Thunderstorms...,HESPRESS English – Morocco News,Friday 19 April 2024 - 13:50
1,https://en.hespress.com/79829-moroccan-mps-rej...,Moroccan MPs reject Algerian amendments at Uni...,Politics,Thursday 15 February 2024 - 16:03
2,https://en.hespress.com/87386-moroccos-hilale-...,"Morocco's Hilale Launches Global Campaign ""fro...",HESPRESS English – Morocco News,Wednesday 3 July 2024 - 09:42
3,https://en.hespress.com/83709-portuguese-compa...,Portuguese companies eye Moroccan market ahead...,HESPRESS English – Morocco News,Thursday 18 April 2024 - 20:01
4,https://en.hespress.com/82804-ministry-denies-...,Ministry Denies Subsidies to Ryanair for Domes...,HESPRESS English – Morocco News,Saturday 6 April 2024 - 09:21
...,...,...,...,...
74,https://en.hespress.com/78369-moroccan-agricul...,Moroccan Agriculture Minister participates in ...,General,Friday 19 January 2024 - 18:49
75,https://en.hespress.com/85604-morocco-turns-to...,Morocco turns to themed experiences to attract...,Economy,Thursday 23 May 2024 - 17:02
76,https://en.hespress.com/84386-labor-day-labor-...,"Labor Day: Labor Unions voice social demands, ...",HESPRESS English – Morocco News,Thursday 2 May 2024 - 10:00
77,https://en.hespress.com/78202-senegalese-moroc...,Senegalese-Moroccan decentralization conferenc...,HESPRESS English – Morocco News,Tuesday 16 January 2024 - 23:36


In [7]:
def clean_and_convert_date(date_str):
    """Convert a scraped date such as 'Friday 19 April 2024 - 13:50' to a Timestamp.

    The time of day after ' - ' is discarded, so the result always falls on
    midnight of the given day.

    Args:
        date_str: Date text in the form '<weekday> <day> <month> <year>',
            optionally followed by ' - HH:MM'. Missing values (None/NaN/NaT)
            and values that are already datetimes are accepted too, which
            keeps re-applying the function to an already converted column safe.

    Returns:
        pandas.Timestamp for the calendar day, or pandas.NaT for a missing value.

    Raises:
        ValueError: If a string does not match the expected format.
    """
    if not isinstance(date_str, str):
        # Missing value -> NaT; already-parsed datetime -> passed through
        if pd.isna(date_str):
            return pd.NaT
        return pd.Timestamp(date_str)

    # Remove the time part and any stray surrounding whitespace
    day_part = date_str.split(' - ')[0].strip()

    # Convert to datetime format, e.g. 'Friday 19 April 2024'
    return pd.to_datetime(day_part, format='%A %d %B %Y')

# Replace the scraped date text in the 'Date' column with the values
# returned by clean_and_convert_date, one row at a time
df_filtered['Date'] = df_filtered['Date'].apply(clean_and_convert_date)

In [8]:
# Check the Date column after conversion
df_filtered

Unnamed: 0,URL,Title,Category,Date
0,https://en.hespress.com/83759-weather-alert-mo...,Weather Alert: Morocco Forecasts Thunderstorms...,HESPRESS English – Morocco News,2024-04-19
1,https://en.hespress.com/79829-moroccan-mps-rej...,Moroccan MPs reject Algerian amendments at Uni...,Politics,2024-02-15
2,https://en.hespress.com/87386-moroccos-hilale-...,"Morocco's Hilale Launches Global Campaign ""fro...",HESPRESS English – Morocco News,2024-07-03
3,https://en.hespress.com/83709-portuguese-compa...,Portuguese companies eye Moroccan market ahead...,HESPRESS English – Morocco News,2024-04-18
4,https://en.hespress.com/82804-ministry-denies-...,Ministry Denies Subsidies to Ryanair for Domes...,HESPRESS English – Morocco News,2024-04-06
...,...,...,...,...
74,https://en.hespress.com/78369-moroccan-agricul...,Moroccan Agriculture Minister participates in ...,General,2024-01-19
75,https://en.hespress.com/85604-morocco-turns-to...,Morocco turns to themed experiences to attract...,Economy,2024-05-23
76,https://en.hespress.com/84386-labor-day-labor-...,"Labor Day: Labor Unions voice social demands, ...",HESPRESS English – Morocco News,2024-05-02
77,https://en.hespress.com/78202-senegalese-moroc...,Senegalese-Moroccan decentralization conferenc...,HESPRESS English – Morocco News,2024-01-16


In [9]:
# Index the rows by their date; drop=False leaves the Date column itself in the frame
df_filtered = df_filtered.set_index('Date', drop=False)

In [10]:
# Order the rows by their index values, ascending (the index holds the Date values set in the previous cell)
df_filtered = df_filtered.sort_index()

In [11]:
# Inspect the date-indexed, sorted table
df_filtered

Unnamed: 0_level_0,URL,Title,Category,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01-01,https://en.hespress.com/77405-marrakechs-iconi...,Marrakech's Iconic Jardin Majorelle Celebrates...,General,2024-01-01
2024-01-01,https://en.hespress.com/77389-survey-93-of-mor...,Survey: 93% of Moroccans support Palestinian r...,Society,2024-01-01
2024-01-06,https://en.hespress.com/77680-moroccan-tiktoks...,"Moroccan TikTok's unhinged challenges, child e...",HESPRESS English – Morocco News,2024-01-06
2024-01-09,https://en.hespress.com/77819-polisario-leqder...,Polisario leader calls for Iranian support ami...,HESPRESS English – Morocco News,2024-01-09
2024-01-12,https://en.hespress.com/77968-moroccan-activis...,Moroccan activists request King to intervene a...,HESPRESS English – Morocco News,2024-01-12
...,...,...,...,...
2024-07-24,https://en.hespress.com/88458-funding-faciliti...,"Funding, facilities, and future stars: What's ...",Sports,2024-07-24
2024-07-25,https://en.hespress.com/88496-argentina-files-...,Argentina files complaint with FIFA after 'cha...,Sports,2024-07-25
2024-07-25,https://en.hespress.com/88517-as-2030-world-cu...,"As 2030 World Cup approaches, Morocco’s street...",Society,2024-07-25
2024-07-28,https://en.hespress.com/88655-under-royal-impe...,"Under Royal Impetus, Sports Elevated to Nation...",Politics,2024-07-28


In [12]:
# Remove the URL column and the Date column; the dates stay available as the index.
# NOTE(review): re-running this cell on the already reduced frame will fail because the columns are gone
df_filtered = df_filtered.drop(columns=['URL','Date'])

In [13]:
# Final table: Title and Category, indexed by Date
df_filtered

Unnamed: 0_level_0,Title,Category
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-01,Marrakech's Iconic Jardin Majorelle Celebrates...,General
2024-01-01,Survey: 93% of Moroccans support Palestinian r...,Society
2024-01-06,"Moroccan TikTok's unhinged challenges, child e...",HESPRESS English – Morocco News
2024-01-09,Polisario leader calls for Iranian support ami...,HESPRESS English – Morocco News
2024-01-12,Moroccan activists request King to intervene a...,HESPRESS English – Morocco News
...,...,...
2024-07-24,"Funding, facilities, and future stars: What's ...",Sports
2024-07-25,Argentina files complaint with FIFA after 'cha...,Sports
2024-07-25,"As 2030 World Cup approaches, Morocco’s street...",Society
2024-07-28,"Under Royal Impetus, Sports Elevated to Nation...",Politics
