In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Monthly ForexFactory calendar pages to scrape: Dec 2024, Jan 2025, and Feb 2025.
# The "month=<mon>.<year>" parameter is also where the scraping loop reads the year from.
urls = [
    "https://www.forexfactory.com/calendar?month=dec.2024",
    "https://www.forexfactory.com/calendar?month=jan.2025",
    "https://www.forexfactory.com/calendar?month=feb.2025"
]

# Browser-like User-Agent header sent with every page request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Accumulates one record per scraped event row:
# [date, time, currency, impact, event, actual, forecast, previous].
data = []

# CSS class of the impact icon -> impact label.
# The orange class was previously missing, so those events were recorded as
# "N/A", and yellow/grey were labelled one level too high.
# NOTE(review): class names and labels follow the ForexFactory calendar legend
# (red = high, orange = medium, yellow = low, grey = non-economic) -- confirm
# against the live page markup.
impact_mapping = {
    "icon--ff-impact-red": "High",
    "icon--ff-impact-ora": "Medium",
    "icon--ff-impact-yel": "Low",
    "icon--ff-impact-gry": "Non-Economic",
}

def parse_date_from_text(date_text, year):
    """
    Convert a day label plus a year into an ISO date string.

    Args:
        date_text: Abbreviated month name and day of month, e.g. "Dec 1".
        year: Four-digit year, as a string or an integer (e.g. "2024" or 2024).

    Returns:
        The date formatted as "YYYY-MM-DD", or None when the inputs do not
        form a valid "%b %d %Y" date (unknown month, impossible day, missing
        text, ...).
    """
    try:
        # Formatting instead of concatenating lets an integer year through
        # rather than failing on str + int.
        parsed = datetime.strptime(f"{date_text} {year}", "%b %d %Y")
    except ValueError:
        # strptime signals every unparseable or impossible date this way.
        return None
    return parsed.strftime("%Y-%m-%d")

# Scrape each monthly calendar page and append one record per event row to `data`.
for url in urls:
    # The year comes from the URL's "month=" parameter (e.g., "jan.2025"),
    # because day-breaker rows only carry text such as "Dec 1".
    try:
        month_param = url.split("month=")[1]
        month_part, year_part = month_param.split(".")
    except (IndexError, ValueError) as e:
        # IndexError: no "month=" in the URL; ValueError: value is not "<mon>.<year>".
        print("Error extracting month and year from url:", url, e)
        continue

    # A timeout keeps an unresponsive server from hanging the run, and an HTTP
    # error status is reported as such instead of surfacing as a missing table.
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print("Request failed for url:", url, e)
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="calendar__table")
    if table is None:
        print("Table not found for url:", url)
        continue

    current_date = None
    current_time = None  # Last non-empty time seen for the current day

    for row in table.find_all("tr"):
        classes = row.get("class", [])

        # Day-breaker rows mark the start of a new day and hold no event data.
        if "calendar__row--day-breaker" in classes:
            td = row.find("td", class_="calendar__cell")
            span = td.find("span") if td is not None else None
            if span is not None:
                date_text = span.text.strip()  # e.g., "Dec 1"
                new_date = parse_date_from_text(date_text, year_part)
                if new_date:
                    current_date = new_date
                    current_time = None  # Times do not carry over between days
            continue

        # Some rows carry a Unix timestamp attribute that identifies their day.
        if row.has_attr("data-day-dateline"):
            try:
                timestamp = int(row["data-day-dateline"])
                # NOTE(review): fromtimestamp() converts using this machine's
                # local timezone, so the resulting date can differ between
                # machines -- confirm which timezone the site's value assumes.
                event_date = datetime.fromtimestamp(timestamp)
            except (TypeError, ValueError, OverflowError, OSError):
                # Malformed or out-of-range timestamp: keep the date taken
                # from the most recent day-breaker row.
                pass
            else:
                new_date = event_date.strftime("%Y-%m-%d")
                if new_date != current_date:
                    current_date = new_date
                    current_time = None

        # Only rows with a "calendar__row--*" class are treated as event rows.
        if not any(cls.startswith("calendar__row--") for cls in classes):
            continue

        try:
            # Events sharing a time slot leave the time cell empty, so reuse
            # the last time seen on the same day.
            time_cell = row.find("td", class_="calendar__time")
            time_text = time_cell.text.strip() if time_cell else ""
            if time_text:
                current_time = time_text
            else:
                time_text = current_time if current_time else "N/A"

            currency = row.find("td", class_="calendar__currency").text.strip()

            # The impact level is encoded as a CSS class on the icon span;
            # the first class found in impact_mapping wins.
            impact = "N/A"
            impact_td = row.find("td", class_="calendar__impact")
            impact_span = impact_td.find("span") if impact_td is not None else None
            if impact_span is not None:
                for cls in impact_span.get("class", []):
                    if cls in impact_mapping:
                        impact = impact_mapping[cls]
                        break

            event = row.find("td", class_="calendar__event").text.strip()
            actual = row.find("td", class_="calendar__actual").text.strip()
            forecast = row.find("td", class_="calendar__forecast").text.strip()
            previous = row.find("td", class_="calendar__previous").text.strip()
        except AttributeError:
            # row.find() returned None for a required cell, so this is not a
            # complete event row; skip it.
            continue

        # The event belongs to the most recently seen day.
        final_date = current_date

        data.append([
            final_date,
            time_text,
            currency,
            impact,
            event,
            actual,
            forecast,
            previous
        ])

# Assemble every scraped record into one table.
full_df = pd.DataFrame(
    data,
    columns=[
        "Date", "Time", "Currency", "Impact",
        "Event", "Actual", "Forecast", "Previous",
    ],
)

# Keep only high-impact USD events ...
keep_rows = (full_df["Currency"] == "USD") & (full_df["Impact"] == "High")

# ... whose "Actual", "Forecast", and "Previous" values are all non-empty.
for value_column in ("Actual", "Forecast", "Previous"):
    keep_rows = keep_rows & (full_df[value_column] != "")

# Renumber the surviving rows from zero.
filtered_df = full_df[keep_rows].reset_index(drop=True)

# Write both datasets to CSV without the index column.
full_df.to_csv("full_calendar_data.csv", index=False)
filtered_df.to_csv("filtered_usd_high_impact.csv", index=False)

Full dataset sample:
         Date     Time Currency  Impact                       Event Actual  \
0  2024-12-01  10:45pm      NZD  Medium       Building Consents m/m  -5.2%   
1  2024-12-02  12:50am      JPY  Medium        Capital Spending q/y   8.1%   
2  2024-12-02   1:00am      AUD  Medium      MI Inflation Gauge m/m   0.2%   
3  2024-12-02   1:30am      AUD     N/A            Retail Sales m/m   0.6%   
4  2024-12-02   1:30am      AUD  Medium  ANZ Job Advertisements m/m  -1.3%   

  Forecast Previous  
0              2.4%  
1     6.7%     7.4%  
2              0.3%  
3     0.4%     0.1%  
4              0.7%  

Filtered dataset (USD/High Impact & Non-empty macroindicators):
         Date    Time Currency Impact                           Event Actual  \
0  2024-12-02  4:00pm      USD   High           ISM Manufacturing PMI   48.4   
1  2024-12-03  4:00pm      USD   High              JOLTS Job Openings  7.74M   
2  2024-12-04  2:15pm      USD   High  ADP Non-Farm Employment Change   1