# Data collection, preprocessing, and analysis

## Collect the news dataset from NYT

In [None]:
pip install pynytimes

Collecting pynytimes
  Downloading pynytimes-0.10.0-py3-none-any.whl (20 kB)
Installing collected packages: pynytimes
Successfully installed pynytimes-0.10.0


In [None]:
from pynytimes import NYTAPI
import datetime
import pandas as pd
import numpy as np


def get_news(year, month, day, api_key=None):
    """
    Get up to 10 of the most relevant finance news abstracts published by
    The New York Times on a single day.

    Parameters
    ----------
    year, month, day : int
        Calendar date to query. The same date is used as both the begin and
        the end of the search window.
    api_key : str, optional
        NYT developer API key. When omitted, the ``NYT_API_KEY`` environment
        variable is used, so the secret never has to live in the notebook.

    Returns
    -------
    list of str
        Article abstracts, sorted by relevance, with every comma removed
        (presumably to keep the downstream CSV simple). Articles that have no
        abstract are skipped.

    Raises
    ------
    RuntimeError
        If no API key is passed and ``NYT_API_KEY`` is not set.
    """
    import os  # local import: only needed to look up the API key

    api_key = api_key or os.environ.get("NYT_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No NYT API key found: pass api_key=... or set the NYT_API_KEY "
            "environment variable."
        )

    nyt = NYTAPI(api_key, parse_dates=True)
    query_date = datetime.datetime(year, month, day)
    articles = nyt.article_search(
        results=10,
        dates={"begin": query_date, "end": query_date},
        options={
            "sort": "relevance",
            "news_desk": [
                "Business", "Business Day", "Entrepreneurs", "Financial", "Technology"
            ],
            "section_name": [
                "Business", "Business Day", "Technology"
            ],
        },
    )

    abstracts = []
    for article in articles:
        abstract = article.get("abstract")
        if abstract:  # skip articles whose abstract is missing, None, or empty
            abstracts.append(abstract.replace(",", ""))
    return abstracts

df = pd.DataFrame()  # not read by any code in this cell; kept so the module-level name still exists


def generate_news_file(start=None, end=None, output_path="news.csv"):
    """
    Fetch the daily news abstracts for every date in [start, end] and write
    them to a CSV file.

    The NYT API limits how many requests can be made per day, so the full
    history has to be collected in several date ranges, one call per range.

    Parameters
    ----------
    start, end : str
        First and last date of the range, e.g. ``"2020-03-31"``. Both are
        required; they are forwarded to ``pd.date_range``.
    output_path : str, optional
        Destination CSV file. Defaults to ``"news.csv"``, which is overwritten
        on every call.

    Raises
    ------
    ValueError
        If ``start`` or ``end`` is not provided.

    Notes
    -----
    The file layout is intentionally unusual and must stay this way for the
    cleaning step that reads it back with ``header=None``: the column labels
    ("Date", "News 1" ... "News 10") are stored as the first *data* row, so the
    CSV starts with pandas' default integer header followed by that label row.
    Missing news slots are left as the integer 0.
    """
    if start is None or end is None:
        raise ValueError(
            "generate_news_file() needs an explicit date range, e.g. "
            "generate_news_file(start='2020-03-31', end='2020-09-30')."
        )

    n_news = 10
    dates = pd.date_range(start, end).strftime("%Y-%m-%d")

    # Row 0 holds the column labels; rows 1.. hold one date each.
    matrix = np.zeros((len(dates) + 1, n_news + 1), dtype=object)
    matrix[0, 0] = "Date"
    for col in range(n_news):
        matrix[0, col + 1] = f"News {col + 1}"

    for row, date_str in enumerate(dates, start=1):
        matrix[row, 0] = date_str
        y, m, d = date_str.split("-")
        # Never write past the last news column, even if more items come back.
        news_list = get_news(int(y), int(m), int(d))[:n_news]
        for col, abstract in enumerate(news_list, start=1):
            matrix[row, col] = abstract

    df = pd.DataFrame(matrix)
    df.to_csv(output_path, index=False)


# Date ranges listed in the original notebook (all but the first gap-free and
# marked "done" there):
#   2018-12-01 .. 2020-03-30
#   2020-03-31 .. 2020-09-30
#   2020-10-01 .. 2022-09-30
#   2022-10-01 .. 2023-12-31
# NOTE(review): the range below is the last one listed; change it to the range
# you actually need before running, since each run overwrites news.csv.
generate_news_file(start="2022-10-01", end="2023-12-31")

## Collect the stock market dataset (S&P 500)

In [None]:
import pandas as pd

def download_stock_data(ticker, start, end):
    """
    Fetch price history for `ticker` from Yahoo Finance and save it to
    "stock_price.csv".

    `start` and `end` are passed straight through to `yfinance.download`.
    """
    import yfinance as yf

    prices = pd.DataFrame(yf.download(ticker, start, end))
    prices.to_csv("stock_price.csv")


# ^GSPC is the Yahoo Finance ticker used here for the S&P 500 index.
download_stock_data("^GSPC", "2018-12-01", "2023-12-31")

[*********************100%%**********************]  1 of 1 completed


## Remove news from dates that do not exist in the stock market history

In [None]:
import pandas as pd
import numpy as np

# news.csv starts with two header-like rows: pandas' default integer column
# labels (row 0) and the real labels "Date", "News 1", ... (row 1). Reading it
# with header=None keeps both as data so the real labels can be promoted.
news_df = pd.read_csv("news.csv", header=None)
stock_df = pd.read_csv("stock_price.csv")

# Use the second row as the column names, then drop both header-like rows.
news_df.columns = news_df.iloc[1]
news_df = news_df.drop([0, 1]).reset_index(drop=True)

print(news_df.head(2))

# Keep only the leading "YYYY-MM-DD" part of each stock date. This is done on
# the whole column at once: the previous element-wise `stock_df['Date'][i] = ...`
# was chained indexing, which triggers SettingWithCopyWarning and is not
# guaranteed to write back to stock_df.
stock_df['Date'] = stock_df['Date'].astype(str).str[:10]

# Keep only the news rows whose date also appears in the stock price history.
news_df = news_df[news_df['Date'].isin(stock_df['Date'])]

news_df.to_csv("news_data.csv", index=False)

1        Date                                             News 1  \
0  2020-03-31  The clamor for corporate funding is raising co...   
1  2020-04-01  Restrictions that follow current C.D.C. guidel...   

1                                             News 2  \
0  Public health officials have been pushing airl...   
1  The new T-Mobile will have over 100 million cu...   

1                                             News 3  \
0  The bank said it would make diversity training...   
1  The mayor of New York who is often disdainful ...   

1                                             News 4  \
0  The CNN anchor 49 is “feeling well” and will c...   
1  The Treasury reversed guidance that would have...   

1                                             News 5  \
0  The use of gasoline and other fuels is droppin...   
1  The central bank is weighing what it can do to...   

1                                             News 6  \
0  A $35 million plan to redevelop one of Louisvi...   
1  Welc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['Date'][i] = date


## Apply FinBERT sentiment analysis to the news

In [None]:
import pandas as pd
from tqdm import tqdm  # Import tqdm
_FINBERT_PIPELINE = None  # built on first use, then shared by every later call


def _load_finbert_pipeline():
    """Build the ProsusAI/finbert sentiment pipeline once and reuse it afterwards."""
    global _FINBERT_PIPELINE
    if _FINBERT_PIPELINE is None:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        from transformers import pipeline
        tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
        finbert = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')
        _FINBERT_PIPELINE = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
    return _FINBERT_PIPELINE


def FinBERT_sentiment_score(heading, nlp=None):
    """
    Compute a sentiment score with pretrained FinBERT on a -1 to 1 scale,
    -1 being negative and 1 being positive.

    Parameters
    ----------
    heading : str or iterable of str
        One headline, or all the headlines to score together (e.g. one day's news).
    nlp : callable, optional
        A sentiment pipeline taking a list of texts and returning one
        ``{"label": ..., "score": ...}`` dict per text. Defaults to a
        ProsusAI/finbert pipeline that is loaded once and cached, instead of
        being re-created on every call.

    Returns
    -------
    float or int
        For each text the signed score is ``+score`` for "positive", ``0`` for
        "neutral" and ``-score`` for any other label. The result is the mean of
        the signed scores over all texts. For a single string this equals the
        signed score of that string. Empty input returns 0.

    Notes
    -----
    Previously only the first element of a list was scored and the remaining
    headlines were computed but ignored; the mean now uses every headline.
    """
    texts = [heading] if isinstance(heading, str) else list(heading)
    if not texts:
        return 0  # nothing to score

    if nlp is None:
        nlp = _load_finbert_pipeline()

    signed_scores = []
    for result in nlp(texts):
        if result['label'] == "positive":
            signed_scores.append(result['score'])
        elif result['label'] == "neutral":
            signed_scores.append(0)
        else:
            signed_scores.append(0 - result['score'])
    return sum(signed_scores) / len(signed_scores)


news_df = pd.read_csv("news_data.csv")
news_df = news_df.fillna(0)


def _is_real_headline(value):
    """
    Tell real headlines apart from empty news slots.

    Empty slots show up either as the number 0 (missing values filled above, or
    a column that contains nothing but zeros) or as the string "0" (a zero
    placeholder read back from CSV inside a text column).
    """
    if not isinstance(value, str):
        return value != 0
    return value.strip() not in ("", "0")


BERT_sentiment = []

# Use tqdm to create a progress bar
for row in tqdm(range(len(news_df)), desc="Processing"):
    # Column 0 is the date; the remaining columns are that day's headlines.
    headlines = [str(item) for item in news_df.iloc[row, 1:].tolist() if _is_real_headline(item)]
    # A day without any headline gets a neutral score instead of calling the model.
    score_BERT = FinBERT_sentiment_score(headlines) if headlines else 0
    BERT_sentiment.append(score_BERT)


news_df['FinBERT score'] = BERT_sentiment

news_df.to_csv("sentiment.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Processing: 100%|██████████| 128/128 [08:16<00:00,  3.88s/it]


## Merge all preprocessed news after FinBERT sentiment analysis

In [None]:
import pandas as pd

# Load the two partial sentiment CSVs (adjust the filenames to match your own files)
file1, file2 = (
    pd.read_csv(path) for path in ('sentiment0&1.1.csv', 'sentiment1.2&2.csv')
)

# Append file2's rows below file1's, renumber the index, and save the result
stacked_df = pd.concat([file1, file2], ignore_index=True)
stacked_df.to_csv('sentimentAllNews.csv', index=False)
