# Bank of America

In [5]:
# Import all the libraries and packages
import pandas as pd
import time
import random
import string
import yfinance as yf
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')

# Define stop words
stop_words = set(stopwords.words('english'))

# Define sentiment analysis thresholds
strong_pos = 0.6
weak_pos = 0.2
neutral_lower, neutral_upper = -0.2, 0.2
weak_neg = -0.6

# Web-scape Google News articles for the given bank within the specified date range.
def scrape_news(bank_name, start_date, end_date):
    chrome_options = Options()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    service = Service('/usr/local/bin/chromedriver')
    driver = webdriver.Chrome(service=service, options=chrome_options)

    base_url = "https://www.google.com/search?q={query}&tbm=nws&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"
    current_date = datetime.strptime(start_date, "%m-%d-%Y")
    end_date = datetime.strptime(end_date, "%m-%d-%Y")
    
    titles, dates = [], []

    while current_date <= end_date:
        next_date = current_date + timedelta(days=30)
        if next_date > end_date:
            next_date = end_date

        formatted_start = current_date.strftime('%m/%d/%Y')
        formatted_end = next_date.strftime('%m/%d/%Y')
        url = base_url.format(query=bank_name, start_date=formatted_start, end_date=formatted_end)

        for page in range(0, 100, 10):
            paginated_url = f"{url}&start={page}"
            driver.get(paginated_url)
            time.sleep(random.uniform(3, 7))  # Randomized sleep to avoid bot detection
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            page_titles = [title.text for title in soup.select('div.n0jPhd.ynAwRc.MBeuO.nDgy9d')]
            page_dates = [date.text for date in soup.select('div.OSrXXb.rbYSKb.LfVVr span')]
            
            titles.extend(page_titles)
            dates.extend(page_dates)
            
            if not page_titles:
                break
        
        current_date = next_date + timedelta(days=1)

    driver.quit()
    news_df = pd.DataFrame({'Bank': bank_name, 'Title': titles, 'Date': dates})
    news_df.to_csv(f"{bank_name}_news.csv", index=False)
    return news_df

# Example usage
news_df = scrape_news("Bank of America", "01-01-2024", "12-30-2024")
print(news_df.head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/suyeonkim/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suyeonkim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/suyeonkim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              Bank                                              Title  \
0  Bank of America  BofA Launches CashPro® Insights – Latest Digit...   
1  Bank of America  Bank of America profit falls on one-off charge...   
2  Bank of America  Bank of America Reports Fourth-Quarter 2023 Fi...   
3  Bank of America  Bank of America’s latest office attendance too...   
4  Bank of America  Bank of America invests in blockchain, AI in d...   

           Date  
0   Jan 9, 2024  
1  Jan 12, 2024  
2  Jan 12, 2024  
3  Jan 24, 2024  
4  Jan 29, 2024  


In [6]:
# Clean the text by lowercasing, removing punctuation, and stop words.

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = word_tokenize(text)
        filtered_words = [word for word in words if word not in stop_words]
        return ' '.join(filtered_words)
    return text

# Apply cleaning function
news_df['Cleaned Text'] = news_df['Title'].apply(clean_text)
print(news_df.head())

# Perform sentiment analysis on the cleaned news article titles.

def analyze_sentiment(news_df):
    sentiment = SentimentIntensityAnalyzer()
    sentiment_data = []

    for _, row in news_df.iterrows():
        cleaned_text = row['Cleaned Text']
        sentences = nltk.tokenize.sent_tokenize(cleaned_text)
        compound_scores = [sentiment.polarity_scores(sentence)['compound'] for sentence in sentences]
        average_compound = sum(compound_scores) / len(compound_scores) if compound_scores else 0

        if average_compound > strong_pos:
            sentiment_category = "Strong Positive"
        elif average_compound > weak_pos:
            sentiment_category = "Weak Positive"
        elif neutral_lower <= average_compound <= neutral_upper:
            sentiment_category = "Neutral"
        elif average_compound >= weak_neg:
            sentiment_category = "Weak Negative"
        else:
            sentiment_category = "Strong Negative"

        sentiment_data.append([row['Title'], sentiment_category, average_compound])
    
    sentiment_df = pd.DataFrame(sentiment_data, columns=['Title', 'Sentiment', 'Average Compound Score'])
    news_df = news_df.merge(sentiment_df, on='Title', how='left')
    return news_df

news_df = analyze_sentiment(news_df)
print(news_df.head())

# Aggregate sentiment scores by date. & Compute the average sentiment score and counts different sentiment categories.
def aggregate_sentiment(news_df):
    news_df['Date'] = pd.to_datetime(news_df['Date'], errors='coerce').dt.date

    aggregated_df = news_df.groupby('Date').agg(
        bank = ('Bank', 'first'),
        avg_sentiment=('Average Compound Score', 'mean'),
        strong_positive=('Sentiment', lambda x: sum(x == 'Strong Positive')),
        weak_positive=('Sentiment', lambda x: sum(x == 'Weak Positive')),
        neutral=('Sentiment', lambda x: sum(x == 'Neutral')),
        weak_negative=('Sentiment', lambda x: sum(x == 'Weak Negative')),
        strong_negative=('Sentiment', lambda x: sum(x == 'Strong Negative'))
    ).reset_index()

    return aggregated_df

aggregated_sentiment_df = aggregate_sentiment(news_df)
print(aggregated_sentiment_df.head())

# Fetch stock closing prices for the given bank within the date range.


def fetch_stock_prices(bank_name, start_date, end_date):
    ticker_map = {
        "JPMorgan Chase": "JPM",
        "Bank of America": "BAC",
        "Citigroup": "C",
        "Wells Fargo": "WFC",
        "Goldman Sachs": "GS"
    }
    
    ticker = ticker_map.get(bank_name, None)
    
    if ticker:
        stock_data = yf.download(ticker, start=start_date, end=end_date)
        stock_data = stock_data[['Close']].reset_index()
        stock_data.rename(columns={'Close': 'Stock Price'}, inplace=True)
        stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.date
        return stock_data
    else:
        print("Ticker not found!")
        return pd.DataFrame()

stock_df = fetch_stock_prices("Bank of America", "2024-01-01", "2024-12-30")
stock_df.columns = ['Date', 'Stock Price']
print(stock_df.head())



[*********************100%***********************]  1 of 1 completed

              Bank                                              Title  \
0  Bank of America  BofA Launches CashPro® Insights – Latest Digit...   
1  Bank of America  Bank of America profit falls on one-off charge...   
2  Bank of America  Bank of America Reports Fourth-Quarter 2023 Fi...   
3  Bank of America  Bank of America’s latest office attendance too...   
4  Bank of America  Bank of America invests in blockchain, AI in d...   

           Date                                       Cleaned Text  
0   Jan 9, 2024  bofa launches cashpro® insights – latest digit...  
1  Jan 12, 2024  bank america profit falls oneoff charges share...  
2  Jan 12, 2024  bank america reports fourthquarter 2023 financ...  
3  Jan 24, 2024  bank america ’ latest office attendance tool ‘...  
4  Jan 29, 2024    bank america invests blockchain ai digital push  
              Bank                                              Title  \
0  Bank of America  BofA Launches CashPro® Insights – Latest Digit...   
1




In [7]:
# Merge aggregated_sentiment_df and stock_df
def merge_data(aggregated_sentiment_df, stock_df):
    aggregated_sentiment_df['Date'] = pd.to_datetime(aggregated_sentiment_df['Date'], errors='coerce').dt.date
    stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.date

    final_df = aggregated_sentiment_df.merge(stock_df, on='Date', how='left')
    final_df.to_csv("final_sentiment_stock_data.csv", index=False)
    return final_df

final_data = merge_data(aggregated_sentiment_df, stock_df)
final_data.to_csv("final_data.csv", index=False)
print(final_data.head())


         Date             bank  avg_sentiment  strong_positive  weak_positive  \
0  2024-01-01  Bank of America         0.0000                0              0   
1  2024-01-02  Bank of America         0.0000                0              0   
2  2024-01-03  Bank of America         0.7964                1              0   
3  2024-01-04  Bank of America         0.0000                0              0   
4  2024-01-05  Bank of America         0.2312                1              1   

   neutral  weak_negative  strong_negative  Stock Price  
0        2              0                0          NaN  
1        1              0                0    33.058743  
2        0              0                0    32.697918  
3        1              0                0    32.961224  
4        2              0                0    33.575588  


# JPMorgan Chase

In [8]:
news_df = scrape_news("JPMorgan Chase", "01-01-2024", "12-30-2024")
news_df['Cleaned Text'] = news_df['Title'].apply(clean_text)
news_df = analyze_sentiment(news_df)
aggregated_sentiment_df = aggregate_sentiment(news_df)
stock_df = fetch_stock_prices("JPMorgan Chase", "2024-01-01", "2024-12-30")
stock_df.columns = ['Date', 'Stock Price']
final_data2 = merge_data(aggregated_sentiment_df, stock_df)
final_data2.to_csv("final_data2.csv", index=False)
print(final_data2.head())

[*********************100%***********************]  1 of 1 completed

         Date            bank  avg_sentiment  strong_positive  weak_positive  \
0  2024-01-01  JPMorgan Chase       0.000000                0              0   
1  2024-01-02  JPMorgan Chase       0.440400                0              1   
2  2024-01-04  JPMorgan Chase      -0.489450                0              0   
3  2024-01-05  JPMorgan Chase       0.049150                0              1   
4  2024-01-09  JPMorgan Chase       0.307375                0              3   

   neutral  weak_negative  strong_negative  Stock Price  
0        1              0                0          NaN  
1        0              0                0   167.203125  
2        0              2                0   167.579132  
3        1              0                0   168.419937  
4        1              0                0   166.845917  





# Citigroup

In [9]:
news_df = scrape_news("Citigroup", "01-01-2024", "12-30-2024")
news_df['Cleaned Text'] = news_df['Title'].apply(clean_text)
news_df = analyze_sentiment(news_df)
aggregated_sentiment_df = aggregate_sentiment(news_df)
stock_df = fetch_stock_prices("Citigroup", "2024-01-01", "2024-12-30")
stock_df.columns = ['Date', 'Stock Price']
final_data3 = merge_data(aggregated_sentiment_df, stock_df)
final_data3.to_csv("final_data3.csv", index=False)
print(final_data3.head())

[*********************100%***********************]  1 of 1 completed

         Date       bank  avg_sentiment  strong_positive  weak_positive  \
0  2024-01-02  Citigroup        0.11324                0              2   
1  2024-01-04  Citigroup        0.09966                0              2   
2  2024-01-05  Citigroup        0.00000                0              0   
3  2024-01-07  Citigroup        0.00000                0              0   
4  2024-01-08  Citigroup        0.00000                0              0   

   neutral  weak_negative  strong_negative  Stock Price  
0        3              0                0    50.784542  
1        3              0                0    51.483509  
2        2              0                0    52.019695  
3        1              0                0          NaN  
4        3              0                0    51.713299  





# Goldman Sachs

In [10]:
news_df = scrape_news("Goldman Sachs", "01-01-2024", "12-30-2024")
news_df['Cleaned Text'] = news_df['Title'].apply(clean_text)
news_df = analyze_sentiment(news_df)
aggregated_sentiment_df = aggregate_sentiment(news_df)
stock_df = fetch_stock_prices("Goldman Sachs", "2024-01-01", "2024-12-30")
stock_df.columns = ['Date', 'Stock Price']
final_data4 = merge_data(aggregated_sentiment_df, stock_df)
final_data4.to_csv("final_data4.csv", index=False)
print(final_data4.head())

[*********************100%***********************]  1 of 1 completed

         Date           bank  avg_sentiment  strong_positive  weak_positive  \
0  2024-01-02  Goldman Sachs       0.000000                0              0   
1  2024-01-03  Goldman Sachs       0.090071                1              1   
2  2024-01-04  Goldman Sachs       0.000000                0              0   
3  2024-01-05  Goldman Sachs       0.000000                0              0   
4  2024-01-08  Goldman Sachs       0.163333                0              3   

   neutral  weak_negative  strong_negative  Stock Price  
0        2              0                0   379.116272  
1        4              1                0   372.760284  
2        3              0                0   373.892822  
3        3              0                0   377.300262  
4        2              1                0   379.662994  





# Wells Fargo

In [11]:
news_df = scrape_news("Wells Fargo", "01-01-2024", "12-30-2024")
news_df['Cleaned Text'] = news_df['Title'].apply(clean_text)
news_df = analyze_sentiment(news_df)
aggregated_sentiment_df = aggregate_sentiment(news_df)
stock_df = fetch_stock_prices("Wells Fargo", "2024-01-01", "2024-12-30")
stock_df.columns = ['Date', 'Stock Price']
final_data5 = merge_data(aggregated_sentiment_df, stock_df)
final_data5.to_csv("final_data5.csv", index=False)
print(final_data5.head())

[*********************100%***********************]  1 of 1 completed

         Date         bank  avg_sentiment  strong_positive  weak_positive  \
0  2024-01-02  Wells Fargo      -0.273200                0              0   
1  2024-01-03  Wells Fargo       0.053633                0              2   
2  2024-01-04  Wells Fargo       0.062750                0              4   
3  2024-01-05  Wells Fargo       0.168200                0              3   
4  2024-01-07  Wells Fargo       0.250000                0              2   

   neutral  weak_negative  strong_negative  Stock Price  
0        0              1                0    47.823631  
1        0              1                0    47.193474  
2        1              1                0    47.775150  
3        1              0                0    48.395607  
4        0              0                0          NaN  





In [12]:
combine = pd.concat([final_data, final_data2, final_data3, final_data4, final_data5])

In [None]:
# Calculate the change of Stock Price
combine['Stock Change'] = combine['Stock Price'] - combine['Stock Price'].shift(1)

# If the previous day's value is NaN, use the value from 2 days ago, then 3 days ago
combine['Stock Change'].fillna(combine['Stock Price'] - combine['Stock Price'].shift(2), inplace=True)
combine['Stock Change'].fillna(combine['Stock Price'] - combine['Stock Price'].shift(3), inplace=True)


In [17]:
# Fill missing values using forward fill (previous day's price)
combine['Calculated Stock Price'] = combine['Stock Price']
combine['Calculated Stock Price'] = combine['Calculated Stock Price'].ffill()

# Calculate daily percentage change
combine['Daily Change (%)'] = combine['Calculated Stock Price'].pct_change() * 100

In [None]:
# Save it as a file
combine.to_csv("combine.csv", index=False)
