## Data preprocessing

This notebook builds the merged news–price dataset and cleans the FNSPID headlines for downstream modeling.

In [1]:
from pathlib import Path
import pandas as pd
import sys
import importlib


# Resolve the project root: when the kernel's working directory is a folder
# named "notebooks" the root is its parent; otherwise the working directory
# itself is taken as the root.
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR
DATA_DIR = ROOT / "Data"
MERGED_PATH = DATA_DIR / "merged_news_prices.csv"

# Build settings. The last three are forwarded unchanged to
# build_dataset.main(); their exact semantics are defined in that script.
FORCE_REBUILD = True      # rebuild even when MERGED_PATH already exists
MAX_TICKERS = 100         # forwarded as max_tickers
DOWNLOAD_MISSING = True   # forwarded as download_missing
DOWNLOAD_LIMIT = 1        # forwarded as download_limit

# Make ROOT/scripts importable. The membership check keeps this cell
# idempotent: without it every re-run prepends another duplicate entry
# to sys.path.
scripts_dir = ROOT / "scripts"
if str(scripts_dir) not in sys.path:
    sys.path.insert(0, str(scripts_dir))

import build_dataset
# Reload so edits to the script are picked up without restarting the kernel.
importlib.reload(build_dataset)

if FORCE_REBUILD or not MERGED_PATH.exists():
    build_dataset.main(
        target_tickers=None,
        download_missing=DOWNLOAD_MISSING,
        max_tickers=MAX_TICKERS,
        download_limit=DOWNLOAD_LIMIT,
    )
else:
    print("Using existing merged dataset; set FORCE_REBUILD=True to regenerate.")

print(f"Merged dataset location: {MERGED_PATH}")

Downloading 1 of 86 missing tickers: ['DEJ']


  dt_now = pd.Timestamp.utcnow()
$DEJ: possibly delisted; no price data found  (1d 2011-04-28 -> 2020-06-11)

1 Failed download:
['DEJ']: possibly delisted; no price data found  (1d 2011-04-28 -> 2020-06-11)


No price data returned for DEJ; skipping
Merged dataset written to D:\Financial News Sentiment Analysis\Data\merged_news_prices.csv with 81 rows
Merged dataset location: d:\Financial News Sentiment Analysis\Data\merged_news_prices.csv


### Cleaning pipeline

Steps for `Headlines`:
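- Strip URLs, $TICKER strings, and finance-specific symbols, then drop any remaining non-alphabetic characters.
- Lowercase and tokenize.
- Remove stop words and single-character tokens while keeping key finance terms (up, down, high, low, growth, above, below), then lemmatize the remaining tokens.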

In [2]:
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Project-local NLTK data directory under ROOT/.venv; created if it is missing.
nltk_data_dir = os.path.join(ROOT, ".venv", "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)

# Put the directory at the front of NLTK's search path, but only once, so
# re-running this cell does not add duplicate entries.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.insert(0, nltk_data_dir)

# NLTK resource paths to look up. The last path segment of each entry is the
# package id passed to nltk.download() when the lookup fails.
resources = [
    "tokenizers/punkt",
    "tokenizers/punkt_tab", 
    "corpora/stopwords",
    "corpora/wordnet",
    "corpora/omw-1.4"
]

def download_nltk_resource(resource_path):
    """Download an NLTK resource into ``nltk_data_dir`` if it cannot be found.

    Args:
        resource_path: NLTK resource path such as ``"corpora/stopwords"``.
            The text after the last ``/`` is used as the package id to
            download.

    The lookup goes through ``nltk.data.find``; only a ``LookupError`` from
    that call triggers a (quiet) download, preceded by a one-line notice.
    """
    package_id = resource_path.rpartition("/")[2]
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Not on any NLTK search path yet: fetch it into the project-local dir.
        print(f"Downloading {package_id}...")
        nltk.download(package_id, download_dir=nltk_data_dir, quiet=True)


# Fetch whichever of the listed NLTK resources are not available yet.
for res in resources:
    download_nltk_resource(res)

# English stop words minus the finance terms the pipeline must keep.
keep_terms = {"up", "down", "high", "low", "growth", "above", "below"}
base_stop = set(stopwords.words("english"))
custom_stop = base_stop.difference(keep_terms)

# Pre-compiled patterns used by the headline cleaner.
url_pattern = re.compile(r"https?://\S+|www\.\S+")      # http(s):// URLs and www. links
ticker_pattern = re.compile(r"\$[A-Za-z]{1,10}")        # "$" followed by 1-10 ASCII letters
finance_symbol_pattern = re.compile(r"[\$€£¥%]")        # currency signs and percent
non_alpha_pattern = re.compile(r"[^a-zA-Z\s]")          # anything but ASCII letters/whitespace

lemmatizer = WordNetLemmatizer()

def clean_headline(text: str) -> str:
    """Normalize one raw headline into a space-joined string of lemmas.

    Steps, in order: blank out URLs, ``$TICKER`` mentions, currency/percent
    symbols and every remaining non-letter character; lowercase and tokenize;
    drop stop words (``custom_stop``) and single-character tokens; lemmatize
    the survivors; finally drop lemmas that collapsed to a single character.

    Args:
        text: Raw headline. Any value that is not a ``str`` (for example a
            missing value) is treated as empty.

    Returns:
        The cleaned, lowercase headline, or ``""`` when the input is not a
        string or nothing survives the filters.
    """
    if not isinstance(text, str):
        return ""

    # Replace each unwanted span with a space (not "") so neighbouring words
    # are never glued together.
    for pattern in (url_pattern, ticker_pattern, finance_symbol_pattern, non_alpha_pattern):
        text = pattern.sub(" ", text)

    tokens = word_tokenize(text.lower())

    lemmas = (
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in custom_stop and len(tok) > 1
    )

    # The length filter above runs before lemmatization, but the WordNet
    # lemmatizer can shorten a token to one letter (e.g. "us" -> "u").
    # Re-apply the filter so such fragments do not leak into the output.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

Downloading punkt_tab...
Downloading wordnet...
Downloading omw-1.4...


In [3]:
# Load merged dataset
merged = pd.read_csv(MERGED_PATH, parse_dates=["Date"])
print(merged.head())

        Date Ticker        Open        High         Low       Close  \
0 2020-06-09   AAPL   83.035004   86.402496   83.002502   85.997498   
1 2020-06-09   AMZN  126.472000  131.321503  126.250000  130.042999   
2 2011-05-23    DNO   38.970001   39.090000   38.700001   38.779999   
3 2011-06-08    DNO   37.889999   37.889999   37.040001   37.389999   
4 2011-07-01    DNO   39.889999   40.160000   39.459999   39.650002   

    Adj Close     Volume                                          Headlines  \
0   83.889359  147712400  Why Apple's Stock Is Trading Higher Today Appl...   
1  130.042999  103520000  'Inside Amazon's plan to test warehouse worker...   
2   38.779999      13400      American Drivers Should Thank European Voters   
3   37.389999      38900                                   The End of OPEC?   
4   39.650002       9100  Is China's Slowdown Bullish for the Global Eco...   

   Target  
0       1  
1       1  
2       0  
3       1  
4       1  


In [4]:
# Add the cleaned text as a new column: one clean_headline() call per row.
merged["Headlines_clean"] = merged["Headlines"].map(clean_headline)

# Preview raw and cleaned headlines side by side.
print(merged.loc[:, ["Date", "Ticker", "Headlines", "Headlines_clean"]].head())

        Date Ticker                                          Headlines  \
0 2020-06-09   AAPL  Why Apple's Stock Is Trading Higher Today Appl...   
1 2020-06-09   AMZN  'Inside Amazon's plan to test warehouse worker...   
2 2011-05-23    DNO      American Drivers Should Thank European Voters   
3 2011-06-08    DNO                                   The End of OPEC?   
4 2011-07-01    DNO  Is China's Slowdown Bullish for the Global Eco...   

                                     Headlines_clean  
0  apple stock trading higher today apple could a...  
1  inside amazon plan test warehouse worker covid...  
2               american driver thank european voter  
3                                           end opec  
4              china slowdown bullish global economy  


In [8]:
# Write `merged` to a CSV in DATA_DIR, leaving out the row index.
output_path = DATA_DIR.joinpath("merged_news_prices_cleaned.csv")
merged.to_csv(path_or_buf=output_path, index=False)

print(f"Cleaned dataset saved to {output_path}")


Cleaned dataset saved to d:\Financial News Sentiment Analysis\Data\merged_news_prices_cleaned.csv
