# 1. Data collection

In [1]:
!pip install requests



In [69]:
import requests
import os
import pandas as pd
from datetime import datetime, timedelta
import random
import time
from kaggle_secrets import UserSecretsClient

In [70]:
# API Key
api_key = UserSecretsClient().get_secret("NEWS_API_KEY")

# Base URL
base_url = "https://newsapi.org/v2/everything"

# Query
query = "gold OR XAU/USD OR gold price"

# Create list of available days (past 28 days to today)
today = datetime.now()
available_days = [today - timedelta(days=i) for i in range(1, 28)]

# Randomly pick 30 different days
chosen_days = random.sample(available_days, len(available_days))

# Storage
all_articles = []

# Fetch 5 articles for each chosen day
for day in chosen_days:
    day_str = day.strftime("%Y-%m-%d")
    
    params = {
        "q": query,
        "from": day_str,
        "to": day_str,
        "sortBy": "publishedAt",
        "language": "en",
        "pageSize": 5,   # Only ask for 5 articles per day
        "page": 1,
        "apiKey": api_key
    }
    
    response = requests.get(base_url, params=params)
    
    if response.status_code != 200:
        print(f"Failed for {day_str}: {response.status_code}, {response.text}")
        continue
    
    data = response.json()
    articles = data.get("articles", [])
    
    if articles:
        for article in articles:
            all_articles.append({
                "publishedAt": article.get("publishedAt"),
                "title": article.get("title"),
                "source": article.get("source", {}).get("name"),
                "url": article.get("url"),
                "day": day_str
            })

    time.sleep(1)  # Gentle pause to avoid hitting API limits

print(f"Total articles collected: {len(all_articles)}")



Total articles collected: 132


In [73]:
# Create DataFrame
df_news = pd.DataFrame(all_articles)
# Save
os.makedirs("data/raw", exist_ok=True)
df_news.to_csv("data/raw/gold_news.csv", index=False)

In [72]:
df_news['publishedAt'] = pd.to_datetime(df_news['publishedAt'])
print(df_news['publishedAt'].dt.date.value_counts())

publishedAt
2025-04-07    5
2025-04-21    5
2025-04-15    5
2025-04-08    5
2025-04-13    5
2025-04-24    5
2025-04-04    5
2025-04-19    5
2025-04-18    5
2025-04-16    5
2025-04-02    5
2025-04-14    5
2025-04-23    5
2025-04-06    5
2025-03-30    5
2025-04-09    5
2025-04-05    5
2025-04-20    5
2025-04-01    5
2025-04-10    5
2025-04-03    5
2025-03-29    5
2025-04-22    5
2025-04-12    5
2025-03-31    4
2025-04-11    4
2025-04-17    4
Name: count, dtype: int64


# 2.  Sentiment Analysis on Headlines

In [16]:
%%capture
!pip install transformers #A HuggingFace library to load pretrained NLP models easily.
!pip install torch  #PyTorch for model inference


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [17]:
from transformers import AutoTokenizer # splits text into tokens that models can understand.
from transformers import AutoModelForSequenceClassification #Loads a model that can classify a sequence (your headline) into a label (positive, neutral, negative).
import torch


In [75]:
df=pd.read_csv("/kaggle/input/sentimented-data-xd/gold_news(1).csv")

In [77]:
# Load FinBERT (pretrained for financial sentiment)
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone") #Downloads the FinBERT model called "yiyanghkust/finbert-tone" from HuggingFace.
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

In [78]:
# Prediction function
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_labels = ['negative', 'neutral', 'positive']
    pred_label = sentiment_labels[probs.argmax()]
    return pred_label

In [79]:
# Apply sentiment analysis to each title
df['sentiment'] = df['title'].apply(predict_sentiment)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [80]:
# Create the directory if it doesn't exist
os.makedirs("data/processed", exist_ok=True)

# Save the dataframe
df.to_csv("data/processed/gold_news_with_sentiment.csv", index=False)

In [81]:
df.head()

Unnamed: 0,publishedAt,title,source,url,day,sentiment
0,2025-04-07 23:58:56+00:00,"As U.S. Tariffs Loom Over the Comic Industry, ...",Screen Rant,https://screenrant.com/dc-comics-tariff-prices...,2025-04-07,negative
1,2025-04-07 23:56:22+00:00,Why I'm keeping a close watch on the Fortescue...,Motley Fool Australia,https://www.fool.com.au/2025/04/08/why-im-keep...,2025-04-07,negative
2,2025-04-07 23:54:46+00:00,Nifty precariously close to key support levels,The Times of India,https://economictimes.indiatimes.com/markets/s...,2025-04-07,negative
3,2025-04-07 23:53:31+00:00,Detroit Tigers fan says he paid more for parki...,FOX 2 Detroit,https://www.fox2detroit.com/news/detroit-tiger...,2025-04-07,negative
4,2025-04-07 23:52:53+00:00,Apple Customers Hurry to Buy iPhones Before Ap...,Mactrast.com,https://www.mactrast.com/2025/04/apple-custome...,2025-04-07,positive


# 3. Feature Engineering 

In [82]:
df=pd.read_csv("/kaggle/input/news-with-sentiment/gold_news_with_sentiment(1).csv")

In [83]:
# Convert 'publishedAt' to datetime
df['publishedAt'] = pd.to_datetime(df['publishedAt'])

In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   publishedAt  132 non-null    datetime64[ns, UTC]
 1   title        132 non-null    object             
 2   source       132 non-null    object             
 3   url          132 non-null    object             
 4   day          132 non-null    object             
 5   sentiment    132 non-null    object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 6.3+ KB


In [85]:
# Create a 'date' column (remove hours)
df['date'] = df['publishedAt'].dt.date


In [86]:
df.head()

Unnamed: 0,publishedAt,title,source,url,day,sentiment,date
0,2025-04-07 23:58:56+00:00,"As U.S. Tariffs Loom Over the Comic Industry, ...",Screen Rant,https://screenrant.com/dc-comics-tariff-prices...,2025-04-07,negative,2025-04-07
1,2025-04-07 23:56:22+00:00,Why I'm keeping a close watch on the Fortescue...,Motley Fool Australia,https://www.fool.com.au/2025/04/08/why-im-keep...,2025-04-07,negative,2025-04-07
2,2025-04-07 23:54:46+00:00,Nifty precariously close to key support levels,The Times of India,https://economictimes.indiatimes.com/markets/s...,2025-04-07,negative,2025-04-07
3,2025-04-07 23:53:31+00:00,Detroit Tigers fan says he paid more for parki...,FOX 2 Detroit,https://www.fox2detroit.com/news/detroit-tiger...,2025-04-07,negative,2025-04-07
4,2025-04-07 23:52:53+00:00,Apple Customers Hurry to Buy iPhones Before Ap...,Mactrast.com,https://www.mactrast.com/2025/04/apple-custome...,2025-04-07,positive,2025-04-07


In [87]:
# Map sentiments to numerical values
sentiment_map = {
    'positive': 1,
    'neutral': 0,
    'negative': -1
}
df['sentiment_score'] = df['sentiment'].map(sentiment_map)


In [88]:
# Group by date and aggregate
df_daily = df.groupby('date').agg(
    avg_sentiment=('sentiment_score', 'mean'),
    positive_count=('sentiment', lambda x: (x == 'positive').sum()),
    negative_count=('sentiment', lambda x: (x == 'negative').sum()),
    neutral_count=('sentiment', lambda x: (x == 'neutral').sum()),
    headline_count=('sentiment', 'count')  # total number of news that day
).reset_index()


In [89]:
# Save it
df_daily.to_csv("data/processed/daily_sentiment_features.csv", index=False)

In [90]:
df_daily

Unnamed: 0,date,avg_sentiment,positive_count,negative_count,neutral_count,headline_count
0,2025-03-29,-0.4,1,3,1,5
1,2025-03-30,-0.8,0,4,1,5
2,2025-03-31,-0.5,0,2,2,4
3,2025-04-01,-0.6,1,4,0,5
4,2025-04-02,-0.6,1,4,0,5
5,2025-04-03,-0.2,2,3,0,5
6,2025-04-04,-0.2,2,3,0,5
7,2025-04-05,-0.6,0,3,2,5
8,2025-04-06,-0.8,0,4,1,5
9,2025-04-07,-0.6,1,4,0,5
