### Oil Prices

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base address of the paginated World News listing; the page number and
# ".html" are appended for every request.
url = "https://oilprice.com/Latest-Energy-News/World-News/Page-"

# Parallel per-article columns: entry k of every list describes the same article.
news_titles = []
news_date = []
news_writer = []
news_excerpt = []

# NOTE(review): range(150) requests Page-0 .. Page-149 — confirm that Page-0
# exists on the site and is not a duplicate of Page-1.
for i in range(150):
    try:
        # Bounded wait so a stalled connection cannot hang the whole crawl.
        req = requests.get(url + str(i) + ".html", timeout=30)
        req.raise_for_status()
    except requests.RequestException as err:
        # Best effort: a failed page is reported and skipped, not fatal.
        print(f"Skipping page {i}: {err}")
        continue

    html = req.text
    soup = BeautifulSoup(html, "html.parser")

    title_elements = soup.select(".categoryArticle__title")
    meta_elements = soup.select(".categoryArticle__meta")
    excerpt_elements = soup.select('.categoryArticle__excerpt')

    # The three selections are paired by position, so a page where the counts
    # differ cannot be aligned reliably; skip it instead of mixing articles.
    if not (len(title_elements) == len(meta_elements) == len(excerpt_elements)):
        print(
            f"Skipping page {i}: found {len(title_elements)} titles, "
            f"{len(meta_elements)} meta lines and {len(excerpt_elements)} excerpts"
        )
        continue

    for title_element, meta_element, excerpt_element in zip(
        title_elements, meta_elements, excerpt_elements
    ):
        # The original parsing expects meta text of the form
        # "<date> at <time> | <writer>". partition() never raises on a
        # malformed line and avoids binding the name `time`, which later
        # cells use as the stdlib module.
        posted, _, writer = meta_element.text.strip().partition(' | ')
        date = posted.partition(' at ')[0]

        news_titles.append(title_element.text.strip())
        news_writer.append(writer)
        news_date.append(date)
        news_excerpt.append(excerpt_element.text.strip())

# One row per article, labelled by its (still textual) publication date.
news = pd.DataFrame(
    {"title": news_titles, "writer": news_writer, "excerpt": news_excerpt},
    index=news_date,
)

In [None]:
# Display the scraped frame (columns: title, writer, excerpt; index: date).
news

In [None]:
# Convert the index labels to pandas timestamps.
parsed_dates = pd.to_datetime(news.index)
news.index = parsed_dates

In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import nltk
from nltk.corpus import stopwords

# Hugging Face checkpoint used for both the tokenizer and the classifier.
model_name = "yiyanghkust/finbert-tone"
# Tokenizer and sequence-classification model loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): later cells pick class probabilities by fixed position;
# verify those positions against model.config.id2label.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # Suppress FutureWarning messages

In [None]:
# Fetch the NLTK stop-word corpus and keep the English list as a set.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Distinct values of the news index.
unique_dates = news.index.unique()

# Empty frame with the columns used for the per-date title sentiment.
sentiment_columns = ["Date", "Positive", "Negative", "Neutral"]
daily_sentiments = pd.DataFrame(columns=sentiment_columns)

In [None]:
# Tokenise every title and drop the tokens that are English stop words.
text_for_date = news["title"].tolist()

tokenized_texts = []
for headline in text_for_date:
    kept_tokens = [
        piece for piece in tokenizer.tokenize(headline)
        if piece.lower() not in stop_words
    ]
    tokenized_texts.append(kept_tokens)

In [None]:
# Inspect the first entry of tokenized_texts.
tokenized_texts[0]

In [None]:
# Length of the longest entry in tokenized_texts.
max(map(len, tokenized_texts))

In [None]:
import time

# Wall-clock start of the title-scoring run (kept for ad-hoc timing).
start_time = time.time()

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Resolve class positions from the checkpoint's own label names instead of
# assuming an order. The fallbacks are the order documented on the
# finbert-tone model card (0 = neutral, 1 = positive, 2 = negative).
label_positions = {
    str(name).lower(): int(idx) for idx, name in model.config.id2label.items()
}
positive_idx = label_positions.get("positive", 1)
negative_idx = label_positions.get("negative", 2)
neutral_idx = label_positions.get("neutral", 0)

# Upper bound on tokens per title; loop-invariant, so set once.
max_token_length = 150

# Rows are collected in a list and turned into a frame once at the end, which
# avoids re-copying the frame with pd.concat on every date.
sentiment_rows = []

for date in unique_dates:
    #text_for_date = data[data["rgs_dt"] == date]["news_smy_ifo"].tolist()
    text_for_date = news[news.index == date]["title"].tolist()
    if not text_for_date:
        # Nothing to score (e.g. a missing date label); skip the empty batch.
        continue

    tokenized_texts = [
        [word for word in tokenizer.tokenize(text) if word.lower() not in stop_words]
        for text in text_for_date
    ]

    # Rebuild plain text from the remaining tokens. convert_tokens_to_string
    # merges "##" continuation pieces back into words; a plain " ".join would
    # leave them to be re-tokenised as punctuation.
    tokenized_texts = [
        tokenizer.convert_tokens_to_string(tokens[:max_token_length])
        for tokens in tokenized_texts
    ]
    inputs = tokenizer(tokenized_texts, padding = True, truncation = True, return_tensors = "pt", max_length = max_token_length)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Per-title class probabilities, averaged over all titles of the date.
    probabilities = torch.softmax(logits, dim=1)
    positive_prob = probabilities[:, positive_idx].mean().item()
    negative_prob = probabilities[:, negative_idx].mean().item()
    neutral_prob = probabilities[:, neutral_idx].mean().item()

    sentiment_rows.append({"Date": date,
                           "Positive": positive_prob,
                           "Negative": negative_prob,
                           "Neutral": neutral_prob})

# Final frame: indexed by Date, with columns Positive / Negative / Neutral.
daily_sentiments = pd.DataFrame(
    sentiment_rows, columns = ["Date", "Positive", "Negative", "Neutral"]
).set_index("Date")
daily_sentiments

In [None]:
# Imported here so this cell does not depend on another cell having bound
# the name `time` to the stdlib module.
import time

# Wall-clock start of the excerpt-scoring run (kept for ad-hoc timing).
start_time = time.time()

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Resolve class positions from the checkpoint's own label names instead of
# assuming an order. The fallbacks are the order documented on the
# finbert-tone model card (0 = neutral, 1 = positive, 2 = negative).
label_positions = {
    str(name).lower(): int(idx) for idx, name in model.config.id2label.items()
}
positive_idx = label_positions.get("positive", 1)
negative_idx = label_positions.get("negative", 2)
neutral_idx = label_positions.get("neutral", 0)

# Upper bound on tokens per excerpt; loop-invariant, so set once.
max_token_length = 500

# Rows are collected in a list and turned into a frame once at the end, which
# avoids re-copying the frame with pd.concat on every date.
excerpt_rows = []

for date in unique_dates:
    #text_for_date = data[data["rgs_dt"] == date]["news_smy_ifo"].tolist()
    text_for_date = news[news.index == date]["excerpt"].tolist()
    if not text_for_date:
        # Nothing to score (e.g. a missing date label); skip the empty batch.
        continue

    tokenized_texts = [
        [word for word in tokenizer.tokenize(text) if word.lower() not in stop_words]
        for text in text_for_date
    ]

    # Rebuild plain text from the remaining tokens. convert_tokens_to_string
    # merges "##" continuation pieces back into words; a plain " ".join would
    # leave them to be re-tokenised as punctuation.
    tokenized_texts = [
        tokenizer.convert_tokens_to_string(tokens[:max_token_length])
        for tokens in tokenized_texts
    ]
    inputs = tokenizer(tokenized_texts, padding = True, truncation = True, return_tensors = "pt", max_length = max_token_length)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Per-excerpt class probabilities, averaged over all excerpts of the date.
    probabilities = torch.softmax(logits, dim=1)
    positive_prob = probabilities[:, positive_idx].mean().item()
    negative_prob = probabilities[:, negative_idx].mean().item()
    neutral_prob = probabilities[:, neutral_idx].mean().item()

    excerpt_rows.append({"Date": date,
                         "Positive": positive_prob,
                         "Negative": negative_prob,
                         "Neutral": neutral_prob})

# Final frame: indexed by Date, with columns Positive / Negative / Neutral.
daily_sentiments_excerpt = pd.DataFrame(
    excerpt_rows, columns = ["Date", "Positive", "Negative", "Neutral"]
).set_index("Date")
daily_sentiments_excerpt

In [None]:
# Composite title score: Positive minus Negative, plus half of Neutral.
title_positive = daily_sentiments['Positive']
title_negative = daily_sentiments['Negative']
title_neutral = daily_sentiments['Neutral']
daily_sentiments['sentimental_score'] = title_positive - title_negative + title_neutral * 0.5

In [None]:
import matplotlib.pyplot as plt

# Histogram of the title sentimental_score column, using 100 bins.
title_scores = daily_sentiments['sentimental_score']
plt.hist(title_scores, bins=100)
plt.show()

In [None]:
# Composite excerpt score: Positive minus Negative, plus half of Neutral.
excerpt_positive = daily_sentiments_excerpt['Positive']
excerpt_negative = daily_sentiments_excerpt['Negative']
excerpt_neutral = daily_sentiments_excerpt['Neutral']
daily_sentiments_excerpt['sentimental_score'] = excerpt_positive - excerpt_negative + excerpt_neutral * 0.5

In [None]:
# Histogram of the excerpt sentimental_score column, using 100 bins.
excerpt_scores = daily_sentiments_excerpt['sentimental_score']
plt.hist(excerpt_scores, bins=100)
plt.show()

In [None]:
# daily_sentiments_excerpt.to_csv('excerpt_sentimental.csv')
# daily_sentiments.to_csv('title_sentimental.csv')