In [2]:
import pandas as pd

In [None]:
# Load the headline dataset from a CSV file in the working directory.
headlines = pd.read_csv(filepath_or_buffer="merged_headlines_dataset_sorted.csv")

In [None]:
# A bare expression that is not the last line of a cell is evaluated but never
# displayed, so print the largest "published" value explicitly.
print("Latest published:", headlines["published"].max())

# The last expression of the cell is rendered as the cell output.
headlines

In [None]:
!pip install yfinance

In [None]:
import yfinance as yf

# Download daily (interval="1d") prices for the GC=F ticker over the study window.
gold = yf.download("GC=F", start="2019-03-01", end="2025-12-31", interval="1d")
print(len(gold))

# Guard against an empty result: writing it out would produce a CSV with no
# data rows, and the later cells that re-read this file would carry on silently.
if gold.empty:
    raise RuntimeError(
        "No rows were returned for GC=F; daily_gold_prices.csv was not written."
    )

gold.to_csv("daily_gold_prices.csv")


In [None]:
# Read the saved CSV back with default parsing and preview it, so the file's
# actual header layout is visible before it is re-read with skiprows/names.
# (Previously the result was assigned to gold_dum but never looked at.)
gold_dum = pd.read_csv("daily_gold_prices.csv")
gold_dum.head()

In [None]:

# Re-read the saved prices: skip the first two lines of the file, apply flat
# column names, then drop the row labelled 0 (presumably a leftover header
# row -- verify against the file).
col_names = ["Date", "Close", "High", "Low", "Open", "Volume"]
gold = pd.read_csv(
    "daily_gold_prices.csv",
    skiprows=2,
    names=col_names,
).drop(index=0)

# "Date" is left exactly as read from the file; no datetime conversion happens in this cell.
gold


In [None]:
# Order the rows by ascending "published" value, then preview that column.
headlines = headlines.sort_values("published")
headlines["published"]

In [None]:
!pip install transformers

In [None]:
!pip install torch

In [None]:
!pip install huggingface_hub

In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import numpy as np
import torch
import huggingface_hub

In [None]:
# Hugging Face Hub identifier of the sentiment model loaded below.
model_name = "ProsusAI/finbert"

# Extend the Hub network timeouts to 60 seconds. huggingface_hub is already
# imported at this point, so the values are set on its constants module
# directly rather than through environment variables.
# NOTE(review): the earlier assignment targeted HF_HUB_HTTP_TIMEOUT, which does
# not appear among huggingface_hub's documented timeout settings and so likely
# had no effect -- confirm against the installed huggingface_hub version.
huggingface_hub.constants.HF_HUB_ETAG_TIMEOUT = 60      # metadata (ETag) requests
huggingface_hub.constants.HF_HUB_DOWNLOAD_TIMEOUT = 60  # file download requests


In [None]:
# Try to load the tokenizer and model up to five times, pausing five seconds
# between failed attempts; the error from the fifth failure is re-raised.
for attempt in range(5):
    print(f"\nAttempt {attempt+1}/5: Loading FinBERT model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    except Exception as e:
        print("⚠️ Error:", e)
        if attempt >= 4:
            # Out of attempts: surface the last error.
            raise e
        print("⏳ Retrying in 5 seconds...\n")
        time.sleep(5)
    else:
        print("✅ FinBERT loaded successfully!")
        break


In [None]:
# Score every headline title with the loaded model and attach the three class
# probabilities to the headline rows as new columns.

# Output columns, in the order they are written to the result frame.
label_names = ["negative", "neutral", "positive"]

# Resolve which logit index belongs to which label from the model's own config
# instead of assuming a fixed order.
# NOTE(review): the published ProsusAI/finbert config lists its labels as
# positive, negative, neutral for indices 0, 1, 2, so the earlier hard-coded
# 0/1/2 -> negative/neutral/positive indexing would mislabel the scores --
# confirm by inspecting model.config.id2label.
label_to_index = {
    str(label).lower(): int(index)
    for index, label in model.config.id2label.items()
}
missing_labels = [name for name in label_names if name not in label_to_index]
if missing_labels:
    raise ValueError(f"Model config has no index for labels: {missing_labels}")

sentiments = []

# Missing titles are scored as empty strings so every row yields one record.
for text in tqdm(headlines["title"].fillna(""), desc="Analyzing Sentiment"):
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
        # One text per call, so take row 0 of the logits and normalise it.
        scores = softmax(outputs.logits.detach().numpy()[0])
        sentiments.append(
            {name: float(scores[label_to_index[name]]) for name in label_names}
        )
    except Exception as e:
        # Best effort: keep row alignment by recording NaNs for a failed headline.
        sentiments.append({"negative": np.nan, "neutral": np.nan, "positive": np.nan})
        print("⚠️ Error on text:", text, "|", e)

sent_df = pd.DataFrame(sentiments)

# Reset the headline index so both frames line up positionally (0..n-1)
# before being concatenated side by side.
headline_df = pd.concat([headlines.reset_index(drop=True), sent_df], axis=1)


In [None]:
import pandas as pd

# --- Gold prices -----------------------------------------------------------
# Skip the first two lines of the CSV, apply flat column names, and drop the
# row labelled 0 that remains.
col_names = ["Date", "Close", "High", "Low", "Open", "Volume"]
gold = pd.read_csv(
    "daily_gold_prices.csv",
    skiprows=2,
    names=col_names,
).drop(index=0)

# Parse dates month-first when ambiguous; unparseable values become NaT.
gold["Date"] = pd.to_datetime(gold["Date"], dayfirst=False, errors="coerce")

# --- Headlines -------------------------------------------------------------
# Give the headline frame the same key name as the gold frame and parse it
# the same way.
headline_df.rename(columns={"published": "Date"}, inplace=True)
headline_df["Date"] = pd.to_datetime(
    headline_df["Date"], dayfirst=False, errors="coerce"
)

# --- Combine ---------------------------------------------------------------
# Outer join keeps dates that appear in only one of the two frames; the result
# is then ordered by ascending Date with a fresh 0..n-1 index.
merged = (
    gold.merge(headline_df, on="Date", how="outer")
    .sort_values("Date")
    .reset_index(drop=True)
)



In [None]:
# Persist the merged gold + headline sentiment table, without the row index.
merged.to_csv(path_or_buf="gold_sentiment_merged.csv", index=False)

In [None]:

!pip install pandas_datareader --quiet

import pandas_datareader.data as web
import pandas as pd
import datetime


# Study window for the macro series.
start = datetime.datetime(2019, 3, 1)
end = datetime.datetime(2025, 10, 10)

# Request data from 1 January of the start year so that an observation stamped
# before `start` is still available to carry forward into the window.
# NOTE(review): on FRED, FEDFUNDS is a monthly series and FPCPITOTLZGUSA an
# annual one, each stamped at the start of its period -- confirm. If so, the
# annual figure covering 2019 is dated before `start` and was never fetched.
fetch_start = datetime.datetime(start.year, 1, 1)

interest_rate = web.DataReader("FEDFUNDS", "fred", fetch_start, end)
inflation = web.DataReader("FPCPITOTLZGUSA", "fred", fetch_start, end)
usd_index = web.DataReader("DTWEXBGS", "fred", fetch_start, end)

# Align the three series side by side on the union of their dates.
macro_df = pd.concat([interest_rate, inflation, usd_index], axis=1)
macro_df.columns = ["interest_rate", "inflation", "usd_index"]

# Carry each series' most recent observation forward over the dates where only
# another series reported, then trim back to the requested window.
macro_df = macro_df.sort_index().ffill().loc[start:]

# Name the date column explicitly. The earlier rename of "index" -> "date"
# only applied when the index had no name; the cell that reads this CSV
# renames a "DATE" column, so that is the header written here.
macro_df = macro_df.rename_axis("DATE").reset_index()

macro_df.to_csv("macro_data.csv", index=False)

print("✅ macro_data.csv created successfully!")
print(macro_df.head())


In [None]:
import pandas as pd


# Attach the macro indicators to every gold/headline row and save the result.
gold_df = pd.read_csv("gold_sentiment_merged.csv")
macro_df = pd.read_csv("macro_data.csv")

gold_df['Date'] = pd.to_datetime(gold_df['Date'])
macro_df.rename(columns={'DATE': 'Date'}, inplace=True)
macro_df['Date'] = pd.to_datetime(macro_df['Date'])

macro_cols = ['interest_rate', 'inflation', 'usd_index']

# A plain left merge keeps only the macro rows whose date also appears in
# gold_df, so an observation stamped on a date with no gold/headline row would
# be dropped before it could be forward-filled. Instead, forward-fill the macro
# series over the union of both date sets first, then merge.
macro_by_date = (
    macro_df.dropna(subset=['Date'])
    .drop_duplicates(subset='Date', keep='last')  # reindex needs unique dates
    .set_index('Date')[macro_cols]
    .sort_index()
)
gold_dates = pd.DatetimeIndex(gold_df['Date'].dropna().unique())
all_dates = macro_by_date.index.union(gold_dates)
macro_daily = (
    macro_by_date.reindex(all_dates)
    .ffill()
    .rename_axis('Date')
    .reset_index()
)

# Dates are unique on the macro side, so the row count of gold_df is preserved.
merged_df = pd.merge(gold_df, macro_daily, on='Date', how='left')

# Fill whatever is still missing as before: rows without a usable date take the
# previous row's values, and rows earlier than the first macro observation take
# the first available one.
merged_df[macro_cols] = merged_df[macro_cols].ffill().bfill()

merged_df.to_csv("gold_macro_sentiment.csv", index=False)

print("Rows:", merged_df.shape[0])
print(merged_df.head())


In [7]:
# Load the combined dataset and parse "Date" strictly as YYYY-MM-DD.
df = pd.read_csv(filepath_or_buffer='gold_macro_sentiment.csv')

date_strings = df['Date']
df['Date'] = pd.to_datetime(date_strings, format='%Y-%m-%d')

def get_season(month):
    """Map a calendar month number to a season name.

    Parameters
    ----------
    month : int or float
        Month number, where 1 is January and 12 is December.

    Returns
    -------
    str or None
        'Winter' for 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8;
        'Autumn' for 9-11. Any other value (for example NaN from a missing
        date, or a number outside 1-12) returns None instead of being
        labelled 'Autumn'.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    # Not a valid month: report "no season" rather than guessing.
    return None

# Label each row with the season of its Date's month.
df['season'] = df['Date'].dt.month.map(get_season)

In [8]:
# Print the first five Date/season pairs as a quick check.
season_preview = df[['Date', 'season']].head(5)
print(season_preview)

        Date  season
0 2019-03-01  Spring
1 2019-03-01  Spring
2 2019-03-01  Spring
3 2019-03-01  Spring
4 2019-03-01  Spring


In [9]:
# Write the frame (now including "season") back over the same file, without the row index.
df.to_csv(path_or_buf='gold_macro_sentiment.csv', index=False)

In [2]:
import pandas as pd
import numpy as np

# Reload the dataset and make sure "Date" is a datetime column.
df = pd.read_csv("gold_macro_sentiment.csv")
df["Date"] = pd.to_datetime(df["Date"])

dates = df["Date"].dt
two_pi = 2 * np.pi

# Add all date-derived columns in one pass. Entries given as lambdas depend on
# columns created earlier in the same call.
df = df.assign(
    # Plain calendar components.
    year=dates.year,
    month=dates.month,
    day=dates.day,
    day_of_week=dates.dayofweek,  # Monday=0, Sunday=6
    week_of_year=dates.isocalendar().week.astype(int),
    quarter=dates.quarter,
    is_weekend=lambda d: (d["day_of_week"] >= 5).astype(int),
    # Linear trend: whole days elapsed since the earliest date in the data.
    days_since_start=lambda d: (d["Date"] - d["Date"].min()).dt.days,
    # Cyclic encodings: month on a 12-step circle, weekday on a 7-step circle.
    month_sin=lambda d: np.sin(two_pi * d["month"] / 12),
    month_cos=lambda d: np.cos(two_pi * d["month"] / 12),
    dayofweek_sin=lambda d: np.sin(two_pi * d["day_of_week"] / 7),
    dayofweek_cos=lambda d: np.cos(two_pi * d["day_of_week"] / 7),
)

# Order the rows by date and renumber them before saving.
df = df.sort_values("Date").reset_index(drop=True)

df.to_csv("final_dataset.csv", index=False)

print("all features added")


all features added
