## Update recent news headlines and perform sentiment analysis with FinBERT

### Web Scraping of news headlines

In [2]:
import time
from datetime import datetime, timedelta
import pandas as pd
import finnhub

api_key = 'your_api_key'
finnhub_client = finnhub.Client(api_key=api_key)

def fetch_news_articles(symbol, start_date, end_date):
    try:
        news = finnhub_client.company_news(symbol, _from=start_date, to=end_date)
        return news
    except Exception:  # Catch any exception
        print(f"Error fetching news for {start_date} to {end_date}. Skipping...")
        return []

symbol = 'AAPL'
start_date = datetime.today() - timedelta(days=366)
end_date = datetime.today()
delta = timedelta(days=2)

# Loop to fetch news articles in delta-day intervals
current_start_date = start_date
week_counter = 0
all_news = []

while current_start_date < end_date:
    current_end_date = current_start_date + delta
    if current_end_date > end_date:
        current_end_date = end_date
    
    news_articles = fetch_news_articles(symbol, current_start_date.strftime('%Y-%m-%d'), current_end_date.strftime('%Y-%m-%d'))
    all_news.extend(news_articles)
    
    if (current_start_date - start_date).days % 7 == 0:
        week_counter += 1
        print(f"Finished fetching data for week {week_counter} starting from {current_start_date.strftime('%Y-%m-%d')}")
    
    current_start_date = current_end_date
    time.sleep(1)  # To avoid exceeding the API rate limit

# Convert the news data to a DataFrame
news_df = pd.DataFrame(all_news)
news_df['datetime'] = news_df['datetime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%Y-%m-%d'))
news_df.drop_duplicates(inplace=True)

# Save File
today_date = datetime.today().strftime('%Y-%m-%d')
filename = f"{symbol}_NEWS_{today_date}.csv"
news_df.to_csv(filename, index=False)
print(f"News headlines saved to {filename}")

Finished fetching data for week 1 starting from 2023-07-13
Finished fetching data for week 2 starting from 2023-07-27
Finished fetching data for week 3 starting from 2023-08-10
Finished fetching data for week 4 starting from 2023-08-24
Finished fetching data for week 5 starting from 2023-09-07
Finished fetching data for week 6 starting from 2023-09-21
Finished fetching data for week 7 starting from 2023-10-05
Finished fetching data for week 8 starting from 2023-10-19
Finished fetching data for week 9 starting from 2023-11-02
Finished fetching data for week 10 starting from 2023-11-16
Finished fetching data for week 11 starting from 2023-11-30
Finished fetching data for week 12 starting from 2023-12-14
Finished fetching data for week 13 starting from 2023-12-28
Finished fetching data for week 14 starting from 2024-01-11
Finished fetching data for week 15 starting from 2024-01-25
Finished fetching data for week 16 starting from 2024-02-08
Finished fetching data for week 17 starting from 

### Sentiment Analysis using FinBERT

In [7]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification

has_gpu = torch.cuda.is_available()
has_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
device = torch.device("mps" if has_mps else "cuda" if has_gpu else "cpu")

print(f"PyTorch Version: {torch.__version__}")
print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
print(f"Target device is {device}")

# Load the saved news data
sym = 'AAPL'
date = '2024-07-13'
file_path = f'../data/{sym}_NEWS_{date}.csv'
df = pd.read_csv(file_path)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", num_labels=3).to(device)

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1)[:, None])
    return e_x / np.sum(e_x, axis=1)[:, None]

def get_sentiment_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    outputs = model(**inputs)
    logits = outputs.logits.detach().cpu().numpy()
    probs = softmax(logits)
    probs = np.squeeze(probs)   # [positive, negative, neutral]
    return probs[0] - probs[1]  # positive - negative

# Apply sentiment analysis with progress bar
tqdm.pandas()
df['sentiment'] = df['headline'].progress_apply(get_sentiment_score)

# Compute the average sentiment score and headline count for each date
summary = df.groupby('datetime').agg(
    avg_sentiment=('sentiment', 'mean'),
    count=('headline', 'count')
).reset_index().sort_values(by='datetime')

# Save the sentiment summary to a CSV file
summary_file = f"{sym}_sentiment_{date}.csv"
summary.to_csv(summary_file, index=False)
print(f"Sentiment summary saved to {summary_file}")

100%|██████████| 17191/17191 [07:21<00:00, 38.97it/s]

Sentiment summary saved to AAPL_sentiment_2024-07-13.csv





In [8]:
from datetime import datetime, timedelta

start_date = datetime.today() - timedelta(days=366)
end_date = datetime.today()
date_range = pd.date_range(start=start_date, end=end_date).strftime('%Y-%m-%d')

sym = 'AAPL'
sentiment_date = '2024-07-13'
sentiment_file_path = f'../data/{sym}_sentiment_{sentiment_date}.csv'
sentiment_df = pd.read_csv(sentiment_file_path)

# Check for missing dates in the sentiment data
missing_dates = pd.to_datetime(date_range).difference(pd.to_datetime(sentiment_df['datetime']))
print("Missing dates in sentiment data:", missing_dates)

Missing dates in sentiment data: DatetimeIndex(['2023-07-13', '2023-07-14', '2023-07-15', '2023-07-16',
               '2023-07-17', '2023-07-18', '2024-07-13'],
              dtype='datetime64[ns]', freq=None)


### Merge Sentiment and Numerical Data

In [11]:
import pandas as pd
from datetime import datetime

# Define symbol and dates
sym = 'AAPL'
stock_date = '2024-07-11'
sentiment_date = '2024-07-13'

# Load the historical stock price data
stock_file_path = f'../data/{sym}_{stock_date}.csv'
stock_df = pd.read_csv(stock_file_path)

# Load the sentiment summary data
sentiment_file_path = f'../data/{sym}_sentiment_{sentiment_date}.csv'
sentiment_df = pd.read_csv(sentiment_file_path)

# Merge the two dataframes by date
merged_df = pd.merge(stock_df, sentiment_df, left_on='Date', right_on='datetime')

# Drop the redundant 'datetime' column from the merged dataframe
merged_df.drop(columns=['datetime'], inplace=True)

# Save the merged dataframe to a new file
last_date = merged_df['Date'].max()
merge_file_path = f'{sym}_Merge_{last_date}.csv'
merged_df.to_csv(merge_file_path, index=False)

print(f"Merged data saved to {merge_file_path}")

Merged data saved to AAPL_Merge_2024-07-10.csv
