In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
import time
import pandas as pd
import re
from datetime import datetime, timedelta
from tqdm import tqdm
import re
import yfinance as yf

In [None]:
# Define the ticker symbol for Brent Crude Oil
ticker = "BZ=F"

# Get the data for the ticker
data = yf.Ticker(ticker)

# Get the entire history data by specifying 'max' period
history = data.history(period="max")

# Check if 'Adj Close' is available, if not, use 'Close' as 'Adj Close'
if 'Adj Close' not in history.columns:
    history['Adj Close'] = history['Close']

# Select the required columns: 'Open', 'High', 'Low', 'Close', 'Adj Close', and 'Volume'
filtered_history = history[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]

# Display the filtered data
print(filtered_history)

# Optional: Save the data to a CSV file
filtered_history.to_csv("NFLX.csv")

In [2]:
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Open the PropertyGuru rental listings page
driver.get("https://oilprice.com/Energy/Crude-Oil/")
time.sleep(10)

In [3]:
def get_max_page_number(driver):
    try:
        # Find the "Last" button
        last_button = driver.find_element(By.CSS_SELECTOR, 'a.last')
        
        # Extract the href attribute
        last_page_url = last_button.get_attribute('href')
        
        # Use regular expression to extract the page number from the URL
        match = re.search(r'Page-(\d+)\.html$', last_page_url)
        if match:
            max_page_number = int(match.group(1))
            return max_page_number
        else:
            print("Could not find page number in the URL.")
            return None
    except Exception as e:
        print("Error finding the last page button:", e)
        return None
    
def scrape_oil_news(driver):
    result = []
    max_page_number = get_max_page_number(driver)
    wait = WebDriverWait(driver, 10)  # WebDriverWait instance with a 10-second timeout

    for current_page in tqdm(range(1, max_page_number + 1), desc="Scraping Pages"):
        print(f"Scraping page {current_page}/{max_page_number}")

        # Scrape articles on the current page
        try:
            # Wait for articles to be present on the page
            articles = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'categoryArticle__content')))
        except Exception as e:
            print("Error locating articles:", e)
            break

        # Loop through each article and extract the required information
        for article in articles:
            try:
                link_element = article.find_element(By.CSS_SELECTOR, 'a')
                article_url = link_element.get_attribute('href')
                article_title = article.find_element(By.CSS_SELECTOR, 'h2.categoryArticle__title').text

                # Extract date
                date_element = article.find_element(By.CSS_SELECTOR, 'p.categoryArticle__meta')
                article_date = date_element.text.split('|')[0].strip()  # Extract only the date part

                result.append({
                        "url": article_url,
                        "title": article_title,
                        "date": article_date
                    })
            except Exception as e:
                print("Error extracting article data:", e)
        print(f"Finish page {current_page}")
        # Navigate to the next page
        if current_page < max_page_number:
            next_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next')))
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            next_button.click()
            time.sleep(2)

    return result

In [None]:
final = scrape_oil_news(driver)

In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
from datetime import datetime

# Function to load the tokenizer and model
def load_model():
    try:
        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None

# Function to perform sentiment analysis on a text input
def analyze_sentiment(text, tokenizer, model):
    try:
        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # Perform the prediction
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get the logits (model's raw output)
        logits = outputs.logits
        
        # Convert logits to sentiment prediction
        sentiment = torch.argmax(logits, dim=1).item()
        
        # Define the sentiment categories
        sentiment_map = {0: -1, 1: 1}  # Adjusting to return -1 for negative, +1 for positive
        
        # Return the corresponding sentiment score
        return sentiment_map[sentiment]
    
    except Exception as e:
        print(f"Error during sentiment analysis: {e}")
        return None

# Function to process the CSV and apply sentiment analysis
def process_csv(file_path):
    # Load the tokenizer and model
    tokenizer, model = load_model()
    
    # Check if the model was loaded successfully
    if tokenizer is None or model is None:
        print("Model loading failed. Exiting.")
        return
    
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Convert the 'date' column to datetime objects (ignoring time for aggregation purposes)
    df['date'] = pd.to_datetime(df['date'], format="%b %d, %Y at %H:%M")
    df['date'] = df['date'].dt.date  # Keep only the date part for grouping
    
    # Initialize the tqdm progress bar
    tqdm_bar = tqdm(total=len(df), desc="Analyzing sentiment", unit=" rows")
    
    # Apply sentiment analysis to each title with a progress bar
    df['sentiment_score'] = df['title'].apply(lambda x: analyze_sentiment(x, tokenizer, model))
    
    # Update the progress bar after each row is processed
    for _ in df.index:
        tqdm_bar.update(1)
    
    # Close the progress bar after completion
    tqdm_bar.close()
    
    # If any sentiment analysis failed, fill with 0 (neutral)
    df['sentiment_score'].fillna(0, inplace=True)
    
    # Aggregating data by date
    aggregated_df = df.groupby('date').agg(
        P_average=('sentiment_score', 'mean'),
        P_sum=('sentiment_score', 'sum'),
        news_count=('title', 'count')  # Counting the number of news articles
    ).reset_index()
    
    # Sort the result by date (from farthest to nearest)
    aggregated_df = aggregated_df.sort_values(by='date', ascending=False)
    
    # Save the aggregated data into a new CSV file
    output_file = 'aggregated_oil_energy_sentiment.csv'
    aggregated_df.to_csv(output_file, index=False)
    
    # Print a message when done
    print(f"New CSV file '{output_file}' has been created. Sentiment analysis completed successfully.")

# Main function to run the process
if __name__ == "__main__":
    # Path to the input CSV file
    file_path = 'oil_energy_data.csv'
    
    # Process the CSV and perform sentiment analysis
    process_csv(file_path)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Analyzing sentiment: 100%|██████████| 6600/6600 [01:55<00:00, 56.93 rows/s]  

New CSV file 'aggregated_oil_energy_sentiment.csv' has been created. Sentiment analysis completed successfully.



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment_score'].fillna(0, inplace=True)


In [26]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

# Initialize the Chrome driver
driver = webdriver.Chrome()

# Open the Twitter (X) profile page
driver.get("https://x.com/spgcioil?lang=en")
time.sleep(5)  # Allow the page to load

# Function to scroll the page and load more tweets
def scroll_down(driver, scroll_pause_time=2):
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        # Scroll down to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        # Wait for new tweets to load
        time.sleep(scroll_pause_time)
        
        # Calculate new scroll height and compare it with the last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # If the page height hasn't changed, break the loop
        last_height = new_height

# Function to scrape tweets until we get 1000
def scrape_1000_tweets(driver, tweet_limit=1000):
    result = []
    wait = WebDriverWait(driver, 10)

    while len(result) < tweet_limit:
        # Scroll down the page to load more tweets
        scroll_down(driver)

        # Wait until tweet elements are loaded
        try:
            tweets = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'article')))
        except Exception as e:
            print("Error locating tweets:", e)
            break

        # Extract information from each tweet
        for tweet in tweets:
            if len(result) >= tweet_limit:
                break  # Stop if we've already collected enough tweets
            try:
                # Extract tweet content
                tweet_content = tweet.find_element(By.CSS_SELECTOR, 'div[lang]').text
                
                # Extract the timestamp
                timestamp_element = tweet.find_element(By.TAG_NAME, 'time')
                timestamp = timestamp_element.get_attribute('datetime')

                # Append to the result if the tweet content is not already in the list
                result.append({
                    "content": tweet_content,
                    "timestamp": timestamp
                })
            except Exception as e:
                print("Error extracting tweet data:", e)
        
        print(f"Collected {len(result)} tweets so far.")

    return result

# Scrape 1000 tweets
tweets_data = scrape_1000_tweets(driver, tweet_limit=1000)

# Close the driver
driver.quit()

# Print the number of tweets collected
print(f"Collected {len(tweets_data)} tweets.")

# Optionally, print or save the collected tweets
for tweet in tweets_data[:10]:  # Print first 10 tweets as a preview
    print(tweet)

Collected 6 tweets so far.
Collected 12 tweets so far.
Collected 18 tweets so far.
Collected 24 tweets so far.
Collected 30 tweets so far.
Collected 36 tweets so far.
Collected 42 tweets so far.
Collected 48 tweets so far.
Collected 54 tweets so far.
Collected 60 tweets so far.
Collected 66 tweets so far.
Collected 72 tweets so far.
Collected 78 tweets so far.
Collected 84 tweets so far.
Collected 90 tweets so far.
Collected 96 tweets so far.
Collected 102 tweets so far.
Collected 108 tweets so far.
Collected 114 tweets so far.
Collected 120 tweets so far.
Collected 126 tweets so far.
Collected 132 tweets so far.
Collected 138 tweets so far.
Collected 144 tweets so far.
Collected 150 tweets so far.
Collected 156 tweets so far.
Collected 162 tweets so far.
Collected 168 tweets so far.
Collected 174 tweets so far.
Collected 180 tweets so far.
Collected 186 tweets so far.
Collected 192 tweets so far.
Collected 198 tweets so far.
Collected 204 tweets so far.
Collected 210 tweets so far.
Co

In [28]:
import pandas as pd
import re
import string
import demoji

# Example of a basic stopwords list (you can expand this list based on your needs)
basic_stopwords = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", 
    "yours", "he", "him", "his", "her", "she", "it", "its", "they", "them", 
    "their", "theirs", "what", "which", "who", "whom", "this", "that", "am", 
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", 
    "having", "do", "does", "did", "but", "if", "or", "because", "as", "until", 
    "while", "of", "at", "by", "for", "with", "about", "against", "between", 
    "into", "through", "during", "before", "after", "above", "below", "to", 
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", 
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
    "can", "will", "just", "don", "should", "now"
])

# Function to clean tweet text without NLTK and remove hashtags
def clean_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r"https?://\S+", "", text)
    
    # Remove hashtags (words starting with #)
    text = re.sub(r"#\w+", "", text)
    
    # Remove emoji descriptions
    text = demoji.replace_with_desc(text)
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in basic_stopwords and len(word) > 2]
    
    # Return cleaned text
    return " ".join(tokens)

# Example tweets data
tweets_data_test = [{'content': 'Expanded pipelines, port capacity connect #Permian oil wells to the world | #crudeoil #OOTT\n\n#Infographic from \n@meghangordon\n + \n@sajbelcart\n: http://plts.co/lwhR50yeMo7', 'timestamp': '2020-02-05T23:18:40.000Z'},
    {'content': "#PlattsInfographic: How much Russian #oil does Europe import?\n#Russia's war on #Ukraine has already disrupted more than 2 million b/d of #crude, #gasoil and #diesel, as official sanctions and industry avoidance crimp flows. #OOTT\n\n Get the full picture! https://okt.to/vSqb94", 'timestamp': '2022-03-21T12:38:03.000Z'},
    {'content': 'US #crudeoil, #natgas rig count drops by 47 amid extreme activity cutbacks | #coronavirus\n\n* Largest single-week drop in four years\n* #Permian takes biggest hit of 20 rigs\n* E&Ps may make further activity cuts\n\nFull story: http://plts.co/KEpm50yWUIT', 'timestamp': '2020-03-26T20:20:23.000Z'},
    {'content': '#Commodities markets face significant uncertainty going into 2023. The Russia-Ukraine war & Chinese demand are likely to play a crucial role, as #recessionfears continue to weigh on US & W. Europe. Here are the key events to watch this quarter. #OOTT #OCTT #ONGT #OATT #CERAWeek', 'timestamp': '2023-01-10T11:27:04.000Z'},
    {'content': 'Saudi Arabia is positioning itself to supply more #oil to Europe as traditional buyers of Russian barrels look to diversify ahead of looming sanctions in early December and analysis suggests the trend could become more pronounced in 2023 | https://okt.to/b1ZfzX\n\n#OOTT #crudeoil', 'timestamp': '2022-11-14T08:30:04.000Z'},
    {'content': 'A new breed of #plastic #recycling plants capable of recovering crude and fuels from plastic waste is piling more pressure on global #oil demand forecasts. Robert Perkins reports: http://plts.co/w5xs50oPYRs | #OOTT', 'timestamp': '2019-04-02T09:07:39.000Z'},
    {'content': 'Expanded pipelines, port capacity connect #Permian oil wells to the world | #crudeoil #OOTT\n\n#Infographic from \n@meghangordon\n + \n@sajbelcart\n: http://plts.co/lwhR50yeMo7', 'timestamp': '2020-02-05T23:18:40.000Z'},
    {'content': "#PlattsInfographic: How much Russian #oil does Europe import?\n#Russia's war on #Ukraine has already disrupted more than 2 million b/d of #crude, #gasoil and #diesel, as official sanctions and industry avoidance crimp flows. #OOTT\n\n Get the full picture! https://okt.to/vSqb94", 'timestamp': '2022-03-21T12:38:03.000Z'},
    {'content': 'US #crudeoil, #natgas rig count drops by 47 amid extreme activity cutbacks | #coronavirus\n\n* Largest single-week drop in four years\n* #Permian takes biggest hit of 20 rigs\n* E&Ps may make further activity cuts\n\nFull story: http://plts.co/KEpm50yWUIT', 'timestamp': '2020-03-26T20:20:23.000Z'},
    {'content': '#Commodities markets face significant uncertainty going into 2023. The Russia-Ukraine war & Chinese demand are likely to play a crucial role, as #recessionfears continue to weigh on US & W. Europe. Here are the key events to watch this quarter. #OOTT #OCTT #ONGT #OATT #CERAWeek', 'timestamp': '2023-01-10T11:27:04.000Z'},
]
# Load the oil_energy_data.csv file
oil_data = pd.read_csv('oil_energy_data.csv')

# Create a list to store cleaned tweets
cleaned_tweets = []

# Clean the tweet content and format the data
for tweet in tweets_data:
    cleaned_tweet = clean_text(tweet['content'])
    cleaned_tweets.append({
        'url': '',  # Blank for tweets
        'title': cleaned_tweet,
        'date': tweet['timestamp']
    })

# Convert cleaned tweets to DataFrame
tweets_df = pd.DataFrame(cleaned_tweets)


In [29]:
tweets_df

Unnamed: 0,url,title,date
0,,expanded pipelines port capacity connect oil w...,2020-02-05T23:18:40.000Z
1,,much russian europe import war already disrupt...,2022-03-21T12:38:03.000Z
2,,rig count drops amid extreme activity cutbacks...,2020-03-26T20:20:23.000Z
3,,markets face significant uncertainty going the...,2023-01-10T11:27:04.000Z
4,,saudi arabia positioning itself supply europe ...,2022-11-14T08:30:04.000Z
...,...,...,...
995,,new breed plants capable recovering crude and ...,2019-04-02T09:07:39.000Z
996,,expanded pipelines port capacity connect oil w...,2020-02-05T23:18:40.000Z
997,,much russian europe import war already disrupt...,2022-03-21T12:38:03.000Z
998,,rig count drops amid extreme activity cutbacks...,2020-03-26T20:20:23.000Z


In [34]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
from datetime import datetime

# Function to load the tokenizer and model
def load_model():
    try:
        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None

# Function to perform sentiment analysis on a text input
def analyze_sentiment(text, tokenizer, model):
    try:
        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # Perform the prediction
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get the logits (model's raw output)
        logits = outputs.logits
        
        # Convert logits to sentiment prediction
        sentiment = torch.argmax(logits, dim=1).item()
        
        # Define the sentiment categories
        sentiment_map = {0: -1, 1: 1}  # Adjusting to return -1 for negative, +1 for positive
        
        # Return the corresponding sentiment score
        return sentiment_map[sentiment]
    
    except Exception as e:
        print(f"Error during sentiment analysis: {e}")
        return None

# Function to process the CSV and apply sentiment analysis
def process_csv(file_path):
    # Load the tokenizer and model
    tokenizer, model = load_model()
    
    # Check if the model was loaded successfully
    if tokenizer is None or model is None:
        print("Model loading failed. Exiting.")
        return
    
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Convert the 'date' column to datetime, handling different formats
    df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Convert to datetime, invalid dates will be NaT
    
    # Filter out rows with dates earlier than 2019-04-02
    filter_date = pd.to_datetime("2019-04-02")
    df = df[df['date'] >= filter_date]
    
    # Initialize the tqdm progress bar
    tqdm_bar = tqdm(total=len(df), desc="Analyzing sentiment", unit=" rows")
    
    # Apply sentiment analysis to each title with a progress bar
    df['sentiment_score'] = df['title'].apply(lambda x: analyze_sentiment(x, tokenizer, model))
    
    # Update the progress bar after each row is processed
    for _ in df.index:
        tqdm_bar.update(1)
    
    # Close the progress bar after completion
    tqdm_bar.close()
    
    # If any sentiment analysis failed, fill with 0 (neutral)
    df['sentiment_score'].fillna(0, inplace=True)
    
    # Aggregating data by date
    aggregated_df = df.groupby('date').agg(
        P_average=('sentiment_score', 'mean'),
        P_sum=('sentiment_score', 'sum'),
        news_count=('title', 'count')  # Counting the number of news articles
    ).reset_index()
    
    # Sort the result by date (from farthest to nearest)
    aggregated_df = aggregated_df.sort_values(by='date', ascending=False)
    
    # Save the aggregated data into a new CSV file
    output_file = 'aggregated_oil_energy_sentiment_filtered.csv'
    aggregated_df.to_csv(output_file, index=False)
    
    # Print a message when done
    print(f"New CSV file '{output_file}' has been created. Sentiment analysis completed successfully.")

# Main function to run the process
if __name__ == "__main__":
    # Path to the input CSV file
    file_path = 'updated_oil_energy_data.csv'
    
    # Process the CSV and perform sentiment analysis
    process_csv(file_path)

Analyzing sentiment: 100%|██████████| 3339/3339 [01:07<00:00, 49.39 rows/s]

New CSV file 'aggregated_oil_energy_sentiment_filtered.csv' has been created. Sentiment analysis completed successfully.



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment_score'].fillna(0, inplace=True)


In [37]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
from datetime import datetime

# Function to load the tokenizer and model
def load_model():
    try:
        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None

# Function to perform sentiment analysis on a text input
def analyze_sentiment(text, tokenizer, model):
    try:
        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # Perform the prediction
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get the logits (model's raw output)
        logits = outputs.logits
        
        # Convert logits to sentiment prediction
        sentiment = torch.argmax(logits, dim=1).item()
        
        # Define the sentiment categories
        sentiment_map = {0: -1, 1: 1}  # Adjusting to return -1 for negative, +1 for positive
        
        # Return the corresponding sentiment score
        return sentiment_map[sentiment]
    
    except Exception as e:
        print(f"Error during sentiment analysis: {e}")
        return None

# Function to process the CSV and apply sentiment analysis
def process_csv(file_path):
    # Load the tokenizer and model
    tokenizer, model = load_model()
    
    # Check if the model was loaded successfully
    if tokenizer is None or model is None:
        print("Model loading failed. Exiting.")
        return
    
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Function to convert the date to the desired format
    def format_date(date_str):
        try:
            # First convert the string into a datetime object
            dt = pd.to_datetime(date_str, errors='coerce')
            if pd.isnull(dt):
                return None  # Return None if the date is not valid
            # Format the date to "Aug 12, 2022 at 08:31" style
            return dt.strftime("%b %d, %Y at %H:%M")
        except Exception as e:
            return None
    
    # Apply the date formatting function
    df['date'] = df['date'].apply(format_date)
    
    # Remove rows where the date conversion failed
    df = df.dropna(subset=['date'])
    
    # Filter out rows with dates earlier than 2019-04-02
    filter_date = pd.to_datetime("2019-04-02")
    df['datetime'] = pd.to_datetime(df['date'], format="%b %d, %Y at %H:%M", errors='coerce')
    df = df[df['datetime'] >= filter_date]
    
    # Initialize the tqdm progress bar
    tqdm_bar = tqdm(total=len(df), desc="Analyzing sentiment", unit=" rows")
    
    # Apply sentiment analysis to each title with a progress bar
    df['sentiment_score'] = df['title'].apply(lambda x: analyze_sentiment(x, tokenizer, model))
    
    # Update the progress bar after each row is processed
    for _ in df.index:
        tqdm_bar.update(1)
    
    # Close the progress bar after completion
    tqdm_bar.close()
    
    # If any sentiment analysis failed, fill with 0 (neutral)
    df['sentiment_score'].fillna(0, inplace=True)
    
    # Drop the 'datetime' column before saving to keep the original format
    df = df.drop(columns=['datetime'])
    
    # Sort the result by date and time
    df = df.sort_values(by='date', ascending=False)
    
    # Save the processed data into a new CSV file
    output_file = 'oil_energy_news_sentiment_per_article.csv'
    df.to_csv(output_file, index=False)
    
    # Print a message when done
    print(f"New CSV file '{output_file}' has been created. Sentiment analysis completed successfully.")

# Main function to run the process
if __name__ == "__main__":
    # Path to the input CSV file
    file_path = 'updated_oil_energy_data.csv'
    
    # Process the CSV and perform sentiment analysis
    process_csv(file_path)


Analyzing sentiment: 100%|██████████| 4339/4339 [01:21<00:00, 53.15 rows/s]

New CSV file 'oil_energy_news_sentiment_per_article.csv' has been created. Sentiment analysis completed successfully.



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment_score'].fillna(0, inplace=True)


In [38]:
import pandas as pd

# Function to process the CSV and calculate the required metrics
def post_process_csv(file_path):
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Function to convert the date to the desired format
    def format_date(date_str):
        try:
            # Convert the string into a datetime object
            dt = pd.to_datetime(date_str, errors='coerce')
            if pd.isnull(dt):
                return None  # Return None if the date is not valid
            # Format the date to "YYYY-MM-DD" style
            return dt.strftime("%Y-%m-%d")
        except Exception as e:
            return None
    
    # Apply the date formatting function
    df['date'] = df['date'].apply(format_date)
    
    # Remove rows where the date conversion failed
    df = df.dropna(subset=['date'])
    
    # Filter out rows with dates earlier than 2019-04-02
    filter_date = pd.to_datetime("2019-04-02")
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df = df[df['date'] >= filter_date]
    
    # Group the data by date and calculate P_mean, P_sum, and Count_news
    aggregated_df = df.groupby('date').agg(
        P_mean=('sentiment_score', 'mean'),
        P_sum=('sentiment_score', 'sum'),
        Count_news=('sentiment_score', 'size')
    ).reset_index()
    
    # Save the aggregated data into a new CSV file
    output_file = 'aggregated_oil_energy_news_sentiment.csv'
    aggregated_df.to_csv(output_file, index=False)
    
    # Print a message when done
    print(f"New CSV file '{output_file}' has been created with aggregated sentiment data.")

# Main function to run the process
if __name__ == "__main__":
    # Path to the input CSV file
    file_path = 'oil_energy_news_sentiment_per_article.csv'
    
    # Process the CSV and calculate P_mean, P_sum, and Count_news
    post_process_csv(file_path)


New CSV file 'aggregated_oil_energy_news_sentiment.csv' has been created with aggregated sentiment data.


In [43]:
import pandas as pd

# Function to process the oil price data and merge it with the sentiment data
def process_and_merge_data(oil_price_file, sentiment_file, output_file):
    # Load the oil price data
    oil_price_df = pd.read_csv(oil_price_file)

    # Print the first few rows of the 'Date' column to help debug
    print("First few rows of 'Date' column before conversion:", oil_price_df['Date'].head())

    # Ensure the 'Date' column is properly formatted by forcing conversion to datetime
    oil_price_df['Date'] = pd.to_datetime(oil_price_df['Date'], errors='coerce', utc=True)

    # Drop any rows with invalid dates (where conversion failed)
    oil_price_df = oil_price_df.dropna(subset=['Date'])

    # Keep only the date (remove time part) after conversion
    oil_price_df['Date'] = oil_price_df['Date'].dt.date

    # Print the first few rows of 'Date' column after conversion for verification
    print("First few rows of 'Date' column after conversion:", oil_price_df['Date'].head())

    # Load the sentiment data
    sentiment_df = pd.read_csv(sentiment_file)

    # Ensure the 'date' column in the sentiment data is properly formatted
    sentiment_df['date'] = pd.to_datetime(sentiment_df['date'], errors='coerce').dt.date

    # Drop rows with invalid date values in the sentiment data
    sentiment_df = sentiment_df.dropna(subset=['date'])

    # Merge the two dataframes on the date
    merged_df = pd.merge(oil_price_df, sentiment_df, left_on='Date', right_on='date', how='inner')

    # Drop the redundant 'date' column after merging
    merged_df = merged_df.drop(columns=['date'])

    # Save the final merged dataframe to a new CSV file
    merged_df.to_csv(output_file, index=False)

    print(f"Final merged data has been saved to '{output_file}'.")

# Main function to run the process
if __name__ == "__main__":
    # File paths
    oil_price_file = 'oil_price.csv'
    sentiment_file = 'aggregated_oil_energy_news_sentiment.csv'
    output_file = 'final_oil_data.csv'

    # Run the process
    process_and_merge_data(oil_price_file, sentiment_file, output_file)


First few rows of 'Date' column before conversion: 0    2007-07-30 00:00:00-04:00
1    2007-07-31 00:00:00-04:00
2    2007-08-01 00:00:00-04:00
3    2007-08-02 00:00:00-04:00
4    2007-08-03 00:00:00-04:00
Name: Date, dtype: object
First few rows of 'Date' column after conversion: 0    2007-07-30
1    2007-07-31
2    2007-08-01
3    2007-08-02
4    2007-08-03
Name: Date, dtype: object
Final merged data has been saved to 'final_oil_data.csv'.
