In [None]:
import mediacloud.api
import datetime as dt
import yfinance as yf
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from datetime import datetime, timedelta
import pandas as pd
import os

# Init the API key and load the finBERT model.
# The key is taken from the MC_API_KEY environment variable when it is set, so
# the real credential never has to be saved inside the notebook. The literal
# below is only a placeholder fallback and must be replaced if the variable is
# not defined.
MC_API_KEY = os.environ.get("MC_API_KEY", "Your_MC_API_KEY")
# Media Cloud collection id passed as `collection_ids` to every story query.
US_NATIONAL_COLLECTION = 34412234
# model_max_length=512 caps the tokenized input; longer text is truncated by
# callers that tokenize with truncation=True.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone', model_max_length=512)
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

In [None]:
# Build the inclusive list of calendar days between two dates
def get_date_range(start_date, end_date):
    """Return every day from ``start_date`` to ``end_date``, both included.

    Args:
        start_date: First day, as a 'YYYY-MM-DD' string.
        end_date: Last day, as a 'YYYY-MM-DD' string.

    Returns:
        A list of 'YYYY-MM-DD' strings in chronological order. The list is
        empty when ``start_date`` falls after ``end_date``.

    Raises:
        ValueError: If either argument does not match the 'YYYY-MM-DD' format.
    """
    iso_format = '%Y-%m-%d'
    first_day = datetime.strptime(start_date, iso_format)
    last_day = datetime.strptime(end_date, iso_format)
    # Both values are parsed at midnight, so the difference is a whole number
    # of days; a negative span produces an empty range below.
    span_in_days = (last_day - first_day).days
    return [
        (first_day + timedelta(days=offset)).strftime(iso_format)
        for offset in range(span_in_days + 1)
    ]

# Query Media Cloud for a single day's news titles about one stock
def news_data(stock, date, max_titles=5):
    """Return the leading news titles published about ``stock`` on ``date``.

    The ticker symbol is first resolved to the company's short name through
    yfinance, because the Media Cloud query is a text search on the name.

    Args:
        stock: Ticker symbol, e.g. "AAPL".
        date: Day to search, as a 'YYYY-MM-DD' string.
        max_titles: Number of leading stories whose titles are used
            (default 5, the value that was previously hard-coded).

    Returns:
        The titles of the first ``max_titles`` stories joined with '. ', or an
        empty string when no story with a title was found.

    Raises:
        ValueError: If ``date`` does not match the 'YYYY-MM-DD' format.
    """
    mc_search = mediacloud.api.SearchApi(MC_API_KEY)
    # Get the corporate name as the input for the Media Cloud API. Look it up
    # from the `stock` argument and fall back to the symbol itself when
    # yfinance reports no short name.
    company_name = yf.Ticker(stock).info.get('shortName') or stock
    search_day = dt.datetime.strptime(date, "%Y-%m-%d").date()

    stories = []
    pagination_token = None
    # Only the first `max_titles` stories are used, so stop fetching as soon
    # as enough are in hand instead of downloading every page.
    while len(stories) < max_titles:
        page, pagination_token = mc_search.story_list(
            company_name,
            start_date=search_day,
            end_date=search_day,
            collection_ids=[US_NATIONAL_COLLECTION],
            # Hand the token back so each request advances to the next page;
            # without it the first page is returned again on every call.
            pagination_token=pagination_token,
        )
        stories += page
        if pagination_token is None:
            break

    titles = [story['title'] for story in stories[:max_titles] if story.get('title')]
    # Separate the titles so the last word of one headline is not fused with
    # the first word of the next before tokenization.
    return '. '.join(titles)

# Use the news title as the input for the finBERT model
def bert_analyze(text):
    """Classify the tone of ``text`` with the module-level finBERT model.

    Args:
        text: Text to classify (the concatenated news titles). Input that
            exceeds the tokenizer's maximum length is truncated.

    Returns:
        One of 'neutral', 'positive' or 'negative'. Empty, whitespace-only or
        ``None`` input is reported as 'neutral' without running the model.
    """
    # A day without any news yields an empty string; there is nothing to
    # classify, so report it as neutral rather than let the model label an
    # empty sequence.
    if text is None or not text.strip():
        return 'neutral'

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    # Label order as given on the yiyanghkust/finbert-tone model card:
    # 0 = neutral, 1 = positive, 2 = negative.
    return ['neutral', 'positive', 'negative'][predicted_class]

def news_bert_analyze(tickers, start_date, end_date):
    """Build a table of daily news-sentiment scores for several tickers.

    For every ticker and every day in the inclusive date range, the day's news
    titles are fetched with ``news_data`` and classified with ``bert_analyze``.

    Args:
        tickers: Iterable of ticker symbols; each becomes one column.
        start_date: First day, as a 'YYYY-MM-DD' string.
        end_date: Last day, as a 'YYYY-MM-DD' string.

    Returns:
        A DataFrame indexed by date string (index name 'Datetime') with one
        integer column per ticker: -1 negative, 0 neutral, 1 positive.
    """
    # Replace the sentiment label with an integer in the range -1 to 1.
    # Mapping each label up front keeps the columns integer-typed from the
    # start instead of relying on DataFrame.replace to downcast object columns.
    sentiment_score = {'neutral': 0, 'positive': 1, 'negative': -1}
    date_range = get_date_range(start_date, end_date)

    scores_by_ticker = {}
    for ticker in tickers:
        scores_by_ticker[ticker] = [
            sentiment_score[bert_analyze(news_data(ticker, date))]
            for date in date_range
        ]

    # Build the frame once rather than inserting columns one at a time.
    df = pd.DataFrame(scores_by_ticker, index=date_range)
    df.index.name = 'Datetime'
    return df

In [None]:
# Output location of the generated CSV.
# NOTE(review): the parent path looks like a Colab Google Drive mount; confirm
# it is mounted before running this cell.
directory = "data"
parent_dir = "/content/drive/MyDrive"
# The name `dir` shadows the builtin; it is kept because other cells may
# refer to it.
dir = os.path.join(parent_dir, directory)

# Check the destination before the long-running analysis starts, so a missing
# parent folder is reported immediately instead of when the CSV is written.
if not os.path.isdir(parent_dir):
    raise FileNotFoundError(f"Output parent directory not found: {parent_dir}")
os.makedirs(dir, exist_ok=True)

tickers = [
                "AAPL", "ABBV", "ABT", "ACN", "ADBE", "AIG", "AMD", "AMGN", "AMT", "AMZN",
                "AVGO", "AXP", "BA", "BAC", "BK", "BKNG", "BLK", "BMY", "BRK-B", "C",
                "CAT", "CHTR", "CL", "CMCSA", "COF", "COP", "COST", "CRM", "CSCO", "CVS",
                "CVX", "DE", "DHR", "DIS", "DOW", "DUK", "EMR", "F", "FDX", "GD",
                "GE", "GILD", "GM", "GOOG", "GOOGL", "GS", "HD", "HON", "IBM", "INTC",
                "INTU", "JNJ", "JPM", "KHC", "KO", "LIN", "LLY", "LMT", "LOW", "MA",
                "MCD", "MDLZ", "MDT", "MET", "META", "MMM", "MO", "MRK", "MS", "MSFT",
                "NEE", "NFLX", "NKE", "NVDA", "ORCL", "PEP", "PFE", "PG", "PM", "PYPL",
                "QCOM", "RTX", "SBUX", "SCHW", "SO", "SPG", "T", "TGT", "TMO", "TMUS",
                "TSLA", "TXN", "UNH", "UNP", "UPS", "USB", "V", "VZ", "WFC", "WMT",
                "XOM"
            ]

# Both dates must use the 'YYYY-MM-DD' format parsed by get_date_range.
start_date = '2022-01-01'
end_date = '2023-12-31'
filename = "sentiment_data.csv"
path = os.path.join(dir, filename)
# One Media Cloud query and one model inference are made per ticker per day.
df = news_bert_analyze(tickers=tickers, start_date=start_date, end_date=end_date)
# index=True keeps the 'Datetime' index as the first CSV column.
df.to_csv(path, index=True)