In [285]:


import requests
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
import random
import scipy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import undetected_chromedriver as uc
import time
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer for sentence splitting
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to C:\Users\Richie
[nltk_data]     Zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Initialize CrudeBERT tokenizer and model
config_path = './crude_bert_config.json' 
model_path = './crude_bert_model.bin'
config = AutoConfig.from_pretrained(config_path)
model = AutoModelForSequenceClassification.from_config(config)
state_dict = torch.load(model_path)
state_dict.pop("bert.embeddings.position_ids", None)
model.load_state_dict(state_dict, strict=False) # Using strict=False to ignore non-critical mismatches
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

  state_dict = torch.load(model_path)


In [297]:
# Function to output sentiment scores using model
def predict_sentiment_for_weekly_df(weekly_df, model, tokenizer):
    model.eval()
    results = []
    class_names = ["negative", "neutral", "positive"]

    # Iterate through each row in the weekly DataFrame
    for _, row in weekly_df.iterrows():
        week_date = row['week_date']
        press_releases = row['press_releases']
        
        # If there are press releases for this week, perform sentiment analysis
        if pd.notna(press_releases):
            combined_text = " ".join(press_releases)
            
            # Split the combined_text into individual sentences
            sentences = sent_tokenize(combined_text)
            
            # Store the sentiment scores for each sentence
            sentence_scores = []
            
            for sentence in sentences:

                # Tokenize and encode the sentence
                inputs = tokenizer(
                    sentence, 
                    return_tensors="pt", 
                    padding=True, 
                    truncation=True, 
                    max_length=512
                )
                
                with torch.no_grad():
                    outputs = model(**inputs)
                    logits = outputs.logits
                    softmax_scores = torch.nn.functional.softmax(logits, dim=-1)
                    print(softmax_scores)
                    # Calculate sentiment score for this sentence
                    negative_score = softmax_scores[0][0].item()
                    neutral_score = softmax_scores[0][1].item()
                    positive_score = softmax_scores[0][2].item()
                    
                    # Sentiment score ranges from -1 (negative) to 1 (positive)
                    sentiment_score = (-10 * negative_score) + (0 * neutral_score) + (10 * positive_score)
                    sentence_scores.append(sentiment_score)
                    print(sentiment_score)
            # Average the sentiment scores across sentences
            avg_sentiment_score = sum(sentence_scores) / len(sentence_scores)
            
            # Append the weekly data with sentiment result
            results.append([week_date, press_releases, avg_sentiment_score])
        else:
            # If no press releases, keep NaN for sentiment
            results.append([week_date, press_releases, np.nan])

    # Create a DataFrame with week_date, press_releases, and sentiment columns
    sentiment_df = pd.DataFrame(results, columns=["week_date", "press_releases", "sentiment"])
    return sentiment_df

In [266]:

options = uc.ChromeOptions()
options.add_argument("--start-maximized")
driver = uc.Chrome(options=options)

# 1 page for each year
main_pages = [
    "https://www.opec.org/opec_web/en/6287.htm",
    "https://www.opec.org/opec_web/en/6762.htm",
    "https://www.opec.org/opec_web/en/7093.htm",
    "https://www.opec.org/opec_web/en/7278.htm",
]


all_urls = []

# Iterate through each page and collect article links for each press release
for main_page in main_pages:
    driver.get(main_page)
    time.sleep(2)  

    # Locate all article links on the main page and store their URLs
    article_links = driver.find_elements(By.CSS_SELECTOR, "div.article h3 a")
    all_urls.extend([link.get_attribute("href") for link in article_links])


all_articles = []

# Loop through each article URL, open the page, extract content, and move to the next
for url in all_urls:
    driver.get(url)
    time.sleep(2)  

    try:
        # Extract headline
        headline = driver.find_element(By.CLASS_NAME, "headline").text

        # Extract slim text
        slim_texts = driver.find_elements(By.CLASS_NAME, "slim")
        slim_content = " ".join([slim.text for slim in slim_texts])

        # Extract date (third line in the p.date element)
        date_element = driver.find_element(By.CLASS_NAME, "date").text.splitlines()
        date = date_element[2] if len(date_element) > 2 else "Date not found"

        # Append each article's details as a dictionary
        all_articles.append({
            "date": date,
            "text": f"{headline}\n{slim_content}"
        })

    except Exception as e:
        print(f"Error occurred on {url}: {e}")

driver.quit()


Error occurred on https://www.opec.org/opec_web/en/press_room/7377.htm: Message: no such element: Unable to locate element: {"method":"css selector","selector":".headline"}
  (Session info: chrome=130.0.6723.91); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00DE38B3+24035]
	(No symbol) [0x00D6BC44]
	(No symbol) [0x00C4C2D3]
	(No symbol) [0x00C8DC86]
	(No symbol) [0x00C8DECB]
	(No symbol) [0x00CCB9D2]
	(No symbol) [0x00CAFED4]
	(No symbol) [0x00CC953F]
	(No symbol) [0x00CAFC26]
	(No symbol) [0x00C8218C]
	(No symbol) [0x00C8310D]
	GetHandleVerifier [0x010896D3+2800643]
	GetHandleVerifier [0x010E428E+3172286]
	GetHandleVerifier [0x010DCEA2+3142610]
	GetHandleVerifier [0x00E86C60+692624]
	(No symbol) [0x00D74C5D]
	(No symbol) [0x00D71968]
	(No symbol) [0x00D71B00]
	(No symbol) [0x00D63FB0]
	BaseThreadInitThunk [0x76D8FA29+25]
	RtlGetAppContainerNamedObjectPa

In [268]:
df = pd.DataFrame(all_articles)  
df['date'] = pd.to_datetime(df['date'])
df.head()
df.to_csv("opec.csv", index=False)

In [273]:
# Merge by dates
start_date = pd.to_datetime('2021-01-01')
end_date = pd.to_datetime('2024-10-13')  # Set explicit end date to October 13th, 2024

weekly_ranges = pd.date_range(start=start_date, end=end_date, freq='W-SUN')  # Use W-SUN to ensure weeks end on Sunday

# Create a new DataFrame for the result
weekly_df = pd.DataFrame(weekly_ranges, columns=['week_date'])
weekly_df['press_releases'] = np.nan

# Populate 'press_releases' with concatenated text of articles within each weekly range
for i in range(len(weekly_ranges) - 1):
    start = weekly_ranges[i]
    end = weekly_ranges[i + 1]
    
    # Filter articles within the current weekly range
    articles_in_week = df[(df['date'] >= start) & (df['date'] < end)]
    
    # Concatenate all texts into a single string or NaN if no articles
    if not articles_in_week.empty:
        weekly_text = " ".join(articles_in_week['text'].tolist())
        weekly_df.at[i, 'press_releases'] = weekly_text
    else:
        weekly_df.at[i, 'press_releases'] = np.nan

# Display the result
weekly_df.head()


No 02/2021
Vienna, Austria
05 Jan 2021
The 13th OPEC and non-OPEC Ministerial Meeting (ONOMM), held via videoconference, concluded on Tuesday, 5 January 2021.
The Meeting, which reconvened following an initial round of discussions on 4 January, reaffirmed the continued commitment of the participating countries in the Declaration of Cooperation (DoC) to a stable market in the mutual interest of producing nations; the efficient, economic and secure supply to consumers; and a fair return on invested capital.
In addition, the Meeting recalled the decision taken by all DoC participating countries at the 10th (Extraordinary) ONOMM on 12 April 2020 to adjust downwards overall crude oil production, the unanimous decisions taken at the 11th ONOMM on 6 June 2020, and the outcomes of the 12th ONOMM on 3 December 2020.
The Meeting highlighted the unprecedented events of 2020 and shocking impact of the COVID-19 pandemic on the world economy and markets, and commended the DoC participating countries

Unnamed: 0,week_date,press_releases
0,2021-01-03,13th OPEC and non-OPEC Ministerial Meeting con...
1,2021-01-10,
2,2021-01-17,
3,2021-01-24,
4,2021-01-31,Kazakhstan conveys its full commitment to comp...


In [296]:
sentiment_df = predict_sentiment_for_weekly_df(weekly_df.tail(2), model, tokenizer)

sentiment_df

tensor([[0.0156, 0.9829, 0.0014]])
-0.014201764948666096
tensor([[0.0160, 0.9826, 0.0014]])
-0.014568963670171797
tensor([[0.0153, 0.9836, 0.0012]])
-0.01413851126562804
tensor([[0.0169, 0.9817, 0.0013]])
-0.01559196412563324
tensor([[0.0115, 0.9871, 0.0014]])
-0.0101417115656659
tensor([[0.0144, 0.9841, 0.0015]])
-0.012959280982613564
tensor([[0.0243, 0.9733, 0.0024]])
-0.02187654795125127
tensor([[0.0177, 0.9812, 0.0011]])
-0.01661712198983878
tensor([[0.0122, 0.9861, 0.0017]])
-0.010461708647198975
tensor([[0.0121, 0.9858, 0.0021]])
-0.009969721781089902
tensor([[0.0408, 0.9486, 0.0106]])
-0.030191874131560326
tensor([[0.0160, 0.9823, 0.0017]])
-0.014272570027969778
tensor([[0.0159, 0.9829, 0.0012]])
-0.014722755178809166
tensor([[1.5742e-02, 9.8329e-01, 9.6514e-04]])
-0.014777097559999675
tensor([[0.0408, 0.9486, 0.0106]])
-0.030191874131560326
tensor([[0.0203, 0.9749, 0.0048]])
-0.015588427428156137
tensor([[1.6556e-02, 9.8249e-01, 9.5069e-04]])
-0.015605784545186907
tensor([[0.01

Unnamed: 0,week_date,press_releases,sentiment
0,2024-10-06,Fifth High-Level Meeting of the OPEC-GECF Ener...,-0.015557
1,2024-10-13,NaT,
