In [241]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
import random
import scipy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import undetected_chromedriver as uc
import time

In [261]:
# Initialize FinBERT tokenizer and model
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_sentiment_for_weekly_df(weekly_df, model, tokenizer):
    model.eval()
    results = []
    class_names = ["negative", "neutral", "positive"]

    # Iterate through each row in the weekly DataFrame
    for _, row in weekly_df.iterrows():
        week_date = row['week_date']
        press_releases = row['press_releases']
        
        # If there are press releases for this week, perform sentiment analysis
        if pd.notna(press_releases):
            combined_text = " ".join(press_releases)
            
            # Split the combined_text into chunks of max 512 tokens
            inputs = tokenizer(combined_text, return_tensors="pt", padding=True, truncation=True, max_length=512, return_overflowing_tokens=True)
            
            # Store the sentiment scores for each chunk
            chunk_scores = []
            
            for i in range(inputs.input_ids.size(0)): 
                with torch.no_grad():
                    outputs = model(input_ids=inputs.input_ids[i].unsqueeze(0), attention_mask=inputs.attention_mask[i].unsqueeze(0))
                    logits = outputs.logits
                    softmax_scores = torch.nn.functional.softmax(logits, dim=-1)
                    
                    # Calculate sentiment score for this chunk
                    sentiment_score = (softmax_scores[0][0].item() * -1) + (softmax_scores[0][2].item() * 1) +  (softmax_scores[0][1].item() * 0)
                    chunk_scores.append(sentiment_score)


            avg_sentiment_score = sum(chunk_scores) / len(chunk_scores)
            
            # Append the weekly data with sentiment result
            results.append([week_date, press_releases, avg_sentiment_score])
        else:
            # If no press releases, keep NaN for sentiment
            results.append([week_date, press_releases, np.nan])

    # Create a DataFrame with week_date, press_releases, and sentiment columns
    sentiment_df = pd.DataFrame(results, columns=["week_date", "press_releases", "sentiment"])
    return sentiment_df


In [266]:

options = uc.ChromeOptions()
options.add_argument("--start-maximized")
driver = uc.Chrome(options=options)

# 1 page for each year
main_pages = [
    "https://www.opec.org/opec_web/en/6287.htm",
    "https://www.opec.org/opec_web/en/6762.htm",
    "https://www.opec.org/opec_web/en/7093.htm",
    "https://www.opec.org/opec_web/en/7278.htm",
]


all_urls = []

# Iterate through each page and collect article links for each press release
for main_page in main_pages:
    driver.get(main_page)
    time.sleep(2)  

    # Locate all article links on the main page and store their URLs
    article_links = driver.find_elements(By.CSS_SELECTOR, "div.article h3 a")
    all_urls.extend([link.get_attribute("href") for link in article_links])


all_articles = []

# Loop through each article URL, open the page, extract content, and move to the next
for url in all_urls:
    driver.get(url)
    time.sleep(2)  

    try:
        # Extract headline
        headline = driver.find_element(By.CLASS_NAME, "headline").text

        # Extract slim text
        slim_texts = driver.find_elements(By.CLASS_NAME, "slim")
        slim_content = " ".join([slim.text for slim in slim_texts])

        # Extract date (third line in the p.date element)
        date_element = driver.find_element(By.CLASS_NAME, "date").text.splitlines()
        date = date_element[2] if len(date_element) > 2 else "Date not found"

        # Append each article's details as a dictionary
        all_articles.append({
            "date": date,
            "text": f"{headline}\n{slim_content}"
        })

    except Exception as e:
        print(f"Error occurred on {url}: {e}")

driver.quit()


Error occurred on https://www.opec.org/opec_web/en/press_room/7377.htm: Message: no such element: Unable to locate element: {"method":"css selector","selector":".headline"}
  (Session info: chrome=130.0.6723.91); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00DE38B3+24035]
	(No symbol) [0x00D6BC44]
	(No symbol) [0x00C4C2D3]
	(No symbol) [0x00C8DC86]
	(No symbol) [0x00C8DECB]
	(No symbol) [0x00CCB9D2]
	(No symbol) [0x00CAFED4]
	(No symbol) [0x00CC953F]
	(No symbol) [0x00CAFC26]
	(No symbol) [0x00C8218C]
	(No symbol) [0x00C8310D]
	GetHandleVerifier [0x010896D3+2800643]
	GetHandleVerifier [0x010E428E+3172286]
	GetHandleVerifier [0x010DCEA2+3142610]
	GetHandleVerifier [0x00E86C60+692624]
	(No symbol) [0x00D74C5D]
	(No symbol) [0x00D71968]
	(No symbol) [0x00D71B00]
	(No symbol) [0x00D63FB0]
	BaseThreadInitThunk [0x76D8FA29+25]
	RtlGetAppContainerNamedObjectPa

In [268]:
df = pd.DataFrame(all_articles)  
df['date'] = pd.to_datetime(df['date'])
df.head()
df.to_csv("opec.csv", index=False)

In [273]:
# Merge by dates
start_date = pd.to_datetime('2021-01-01')
end_date = pd.to_datetime('2024-10-13')  # Set explicit end date to October 13th, 2024

weekly_ranges = pd.date_range(start=start_date, end=end_date, freq='W-SUN')  # Use W-SUN to ensure weeks end on Sunday

# Create a new DataFrame for the result
weekly_df = pd.DataFrame(weekly_ranges, columns=['week_date'])
weekly_df['press_releases'] = np.nan

# Populate 'press_releases' with concatenated text of articles within each weekly range
for i in range(len(weekly_ranges) - 1):
    start = weekly_ranges[i]
    end = weekly_ranges[i + 1]
    
    # Filter articles within the current weekly range
    articles_in_week = df[(df['date'] >= start) & (df['date'] < end)]
    
    # Concatenate all texts into a single string or NaN if no articles
    if not articles_in_week.empty:
        weekly_text = " ".join(articles_in_week['text'].tolist())
        weekly_df.at[i, 'press_releases'] = weekly_text
    else:
        weekly_df.at[i, 'press_releases'] = np.nan

# Display the result
weekly_df.head()


No 02/2021
Vienna, Austria
05 Jan 2021
The 13th OPEC and non-OPEC Ministerial Meeting (ONOMM), held via videoconference, concluded on Tuesday, 5 January 2021.
The Meeting, which reconvened following an initial round of discussions on 4 January, reaffirmed the continued commitment of the participating countries in the Declaration of Cooperation (DoC) to a stable market in the mutual interest of producing nations; the efficient, economic and secure supply to consumers; and a fair return on invested capital.
In addition, the Meeting recalled the decision taken by all DoC participating countries at the 10th (Extraordinary) ONOMM on 12 April 2020 to adjust downwards overall crude oil production, the unanimous decisions taken at the 11th ONOMM on 6 June 2020, and the outcomes of the 12th ONOMM on 3 December 2020.
The Meeting highlighted the unprecedented events of 2020 and shocking impact of the COVID-19 pandemic on the world economy and markets, and commended the DoC participating countries

Unnamed: 0,week_date,press_releases
0,2021-01-03,13th OPEC and non-OPEC Ministerial Meeting con...
1,2021-01-10,
2,2021-01-17,
3,2021-01-24,
4,2021-01-31,Kazakhstan conveys its full commitment to comp...


In [274]:
sentiment_df = predict_sentiment_for_weekly_df(weekly_df.tail(2), model, tokenizer)

sentiment_df

Unnamed: 0,week_date,press_releases,sentiment
0,2024-10-06,Fifth High-Level Meeting of the OPEC-GECF Ener...,0.858531
1,2024-10-13,NaT,
