In [None]:
# Financial News Sentiment – Reproducible Report
"""This notebook regenerates all results. Because the underlying financial and news data are time-sensitive, freshly generated outputs may not exactly match the figures saved in research_outputs/.
This notebook pulls together all the tables, metrics and plots used in the project and the research paper.  
Everything here is generated from the CSV files stored in:

- `research_outputs/tables/`
- `research_outputs/stats/`

The goal is simple:  
Make the analysis **fully reproducible** without needing API calls or re-running the entire pipeline.

The notebook is split into the following sections:

1. Setup & small utility helpers  
2. Coverage of the dataset (tickers, date range, number of news items)  
3. Sentiment and event-level summary tables  
4. Model performance and backtest metrics  
5. Statistical tests  
6. Final test fold evaluation (ROC, PR, confusion matrix)  
7. SHAP feature importance  
8. Sentiment decay analysis

Take your time exploring each section — everything is meant to be clear and easy to follow."""


In [1]:
# 1. SETUP & SMALL HELPERS

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    precision_recall_curve,
    auc,
    classification_report,
)

# Apply seaborn's "whitegrid" theme.
sns.set_theme(style="whitegrid")

# Project root: the parent of the current working directory, made absolute.
# Adjust this if your notebook sits somewhere else.
ROOT = Path("..").resolve()

# Folders used by the report, all located under <ROOT>/research_outputs/.
TABLES, STATS, FIGS = (
    ROOT / "research_outputs" / folder
    for folder in ("tables", "stats", "figures")
)

# Echo the resolved locations as the cell output for a quick sanity check.
TABLES, STATS, FIGS


(WindowsPath('C:/Users/Asus/Downloads/financial-sentiment-nlp/research_outputs/tables'),
 WindowsPath('C:/Users/Asus/Downloads/financial-sentiment-nlp/research_outputs/stats'),
 WindowsPath('C:/Users/Asus/Downloads/financial-sentiment-nlp/research_outputs/figures'))

In [8]:
import pickle

# Locate the trained model relative to the project ROOT defined in the setup
# cell rather than through a machine-specific absolute path, so the notebook
# also runs on other machines and operating systems.
model_path = ROOT / "models" / "catboost_best.pkl"

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load model files that you produced yourself or otherwise trust.
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Show at most the first 50 feature names, followed by the overall count.
print("Model features:", model.feature_names_[:50])
print("Total:", len(model.feature_names_))



Model features: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41']
Total: 42
