<a href="https://colab.research.google.com/github/SabahiJ/DataScienceFinalProject/blob/main/Financial_Sentiment_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



In [None]:
!pip install newsapi-python

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7


In [17]:
# Phase 1: Data Collection & Ingestion
# Import required libraries
import os
import time
import pandas as pd
from datetime import datetime, timedelta
from newsapi import NewsApiClient

# API key configuration
API_KEY = "6fc24465eeb54281b7efa2114bc22223"

# Output directory
os.makedirs("data", exist_ok=True)

# Initialize NewsAPI client
newsapi = NewsApiClient(api_key=API_KEY)

# Define search query
query = "Nvidia OR NVDA"

# Define financial sentiment-related keywords
sentiment_keywords = [
    "bullish", "bearish", "uptrend", "downtrend", "rally", "plunge", "spike", "drop",
    "soar", "slump", "decline", "jump", "rebound", "surge", "fall", "gain", "beat estimates",
    "miss estimates", "guidance", "forecast", "warning", "record high", "record low",
    "growth", "slowdown", "volatile", "volatility", "recession", "earnings", "strong results",
    "disappoint", "underperform", "outperform", "loss", "profit", "buy", "sell", "upgrade", "downgrade"
]

# Function to check if a headline contains relevant sentiment keywords
def is_sentiment_relevant(headline):
    headline = str(headline).lower()
    return any(keyword in headline for keyword in sentiment_keywords)

# Initialize output container
filtered_articles = []

# Time slicing: break the last 30 days into 5 intervals of 6 days each
end_date = datetime.today()
start_date = end_date - timedelta(days=30)
interval_days = 6

while start_date < end_date:
    slice_from = start_date.strftime('%Y-%m-%d')
    slice_to = (start_date + timedelta(days=interval_days)).strftime('%Y-%m-%d')

    try:
        print(f"Fetching from {slice_from} to {slice_to}...")
        articles = newsapi.get_everything(
            q=query,
            language="en",
            from_param=slice_from,
            to=slice_to,
            sort_by="publishedAt",
            page_size=100,
            page=1
        )

        for article in articles["articles"]:
            headline = article["title"]
            if is_sentiment_relevant(headline):
                filtered_articles.append([
                    article["publishedAt"][:10],
                    "Nvidia",
                    article["source"]["name"],
                    headline,
                    article["url"]
                ])

        time.sleep(1)

    except Exception as e:
        print(f"Error fetching from {slice_from} to {slice_to}: {e}")

    # Move to next slice
    start_date += timedelta(days=interval_days)

# Convert to DataFrame
df_filtered = pd.DataFrame(filtered_articles, columns=["Date", "Stock", "Source", "Headline", "URL"])

# Save to CSV
df_filtered.to_csv("data/nvidia_news_raw.csv", index=False)
print(f"{len(df_filtered)} relevant Nvidia headlines saved to 'data/nvidia_news_raw.csv'")


Fetching from 2025-04-10 to 2025-04-16...
Fetching from 2025-04-16 to 2025-04-22...
Fetching from 2025-04-22 to 2025-04-28...
Fetching from 2025-04-28 to 2025-05-04...
Fetching from 2025-05-04 to 2025-05-10...
114 relevant Nvidia headlines saved to 'data/nvidia_news_raw.csv'


In [22]:
# Phase 1 Extension (Updated): Stock Price Collection for Nvidia

# Uses Alpha Vantage FREE endpoint (TIME_SERIES_DAILY) - Goal: Fetch prices to match T+1 sentiment data

import requests
import pandas as pd
from datetime import datetime

# Step 1: Set up Alpha Vantage API parameters
API_KEY = "PTCV59QLK39T51PL"  # Your Alpha Vantage API key
SYMBOL = "NVDA"
FUNCTION = "TIME_SERIES_DAILY"  # Free endpoint

# Step 2: Load sentiment-labeled dataset to extract headline dates
sentiment_df = pd.read_csv("data/nvidia_news_labeled.csv")
sentiment_df["Date"] = pd.to_datetime(sentiment_df["Date"])

# Step 3: Define required date range (T+1 lag)
min_date = sentiment_df["Date"].min() + pd.Timedelta(days=1)
max_date = sentiment_df["Date"].max() + pd.Timedelta(days=1)

# Step 4: Request data from Alpha Vantage
url = f"https://www.alphavantage.co/query?function={FUNCTION}&symbol={SYMBOL}&apikey={API_KEY}&outputsize=full"
response = requests.get(url)
data = response.json()

# Step 5: Check for data presence
if "Time Series (Daily)" not in data:
    raise Exception(f"❌ API returned no usable data: {data.get('Note', data)}")

# Step 6: Parse relevant records only
daily_prices = data["Time Series (Daily)"]
price_data = []

for date_str, values in daily_prices.items():
    try:
        date = pd.to_datetime(date_str)
        if min_date <= date <= max_date:
            open_price = float(values["1. open"])
            close_price = float(values["4. close"])
            price_change = close_price - open_price
            price_direction = "Up" if price_change > 0 else "Down"

            price_data.append({
                "Price_Date": date.strftime("%Y-%m-%d"),
                "Open": open_price,
                "Close": close_price,
                "Price_Change": round(price_change, 2),
                "Price_Direction": price_direction
            })
    except Exception as err:
        print(f"Skipping malformed record on {date_str}: {err}")

# Step 7: Convert to DataFrame and validate
if not price_data:
    raise ValueError("❌ No data matched the sentiment date range.")

price_df = pd.DataFrame(price_data).sort_values("Price_Date")
price_df.to_csv("data/nvidia_price_data.csv", index=False)

# Step 8: Preview results
print(f"✅ Price data collected and saved to 'data/nvidia_price_data.csv' ({len(price_df)} rows)")
print(price_df.head())


✅ Price data collected and saved to 'data/nvidia_price_data.csv' (16 rows)
    Price_Date     Open   Close  Price_Change Price_Direction
15  2025-04-17  104.450  101.49         -2.96            Down
14  2025-04-21   98.770   96.91         -1.86            Down
13  2025-04-22   98.780   98.89          0.11              Up
12  2025-04-23  104.520  102.71         -1.81            Down
11  2025-04-24  103.475  106.43          2.96              Up


In [18]:
pip install transformers



In [19]:
# Phase 2: Sentiment Classification (FinBERT + VADER) - Goal: Clean financial news text and label sentiment using FinBERT and VADER

# 1. Import required libraries
import pandas as pd
import re
from transformers import pipeline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download VADER's lexicon (used to analyze sentiment)
nltk.download("vader_lexicon")

# 2. Load Nvidia news dataset collected in Phase 1
df = pd.read_csv("/content/data/nvidia_news_raw.csv")

# 3. Text Cleaning - Define function to clean text: lowercase and remove special characters
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove punctuation and non-letter characters
    return text.strip()

# Apply cleaning function to each headline and store in new column
df["Cleaned_Headline"] = df["Headline"].apply(clean_text)

# 4. Load FinBERT model - FinBERT is a transformer model trained on financial news and reports
# It classifies text into positive, negative, or neutral
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# 5. Load VADER sentiment analyzer - VADER is a rule-based model designed for general short text sentiment analysis
vader = SentimentIntensityAnalyzer()

# 6. Define a function to interpret VADER's compound score as a sentiment label
# - score > 0.05 is positive
# - score < -0.05 is negative
# - in between is neutral
def get_vader_sentiment(text):
    score = vader.polarity_scores(text)["compound"]
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

# 7. Apply both models to the cleaned headlines - For each headline, assign sentiment using FinBERT (financial NLP)
# and VADER (general rule-based lexicon)
df["FinBERT_Sentiment"] = df["Cleaned_Headline"].apply(lambda x: finbert(x)[0]["label"])
df["VADER_Sentiment"] = df["Cleaned_Headline"].apply(get_vader_sentiment)

# 8. Save the labeled dataset
# Export as CSV to be used in future stages such as analysis and comparison
df.to_csv("data/nvidia_news_labeled.csv", index=False)

# 9. Output sentiment distributions for both models
# This helps assess how each model is labeling the dataset overall
print("FinBERT Sentiment Distribution:")
print(df["FinBERT_Sentiment"].value_counts())

print("\nVADER Sentiment Distribution:")
print(df["VADER_Sentiment"].value_counts())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


FinBERT Sentiment Distribution:
FinBERT_Sentiment
neutral     43
negative    42
positive    29
Name: count, dtype: int64

VADER Sentiment Distribution:
VADER_Sentiment
negative    42
neutral     37
positive    35
Name: count, dtype: int64
