In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import ollama  # LLaMA model API
import re
from tqdm import tqdm  # For progress bars
from concurrent.futures import ThreadPoolExecutor, as_completed  # For parallel processing
from gnews import GNews  # Import the GNews package
from pytz import UTC

### Downloading historical asset prices from yfinance

In [3]:
# Fetch Historical Prices
def fetch_all_historical_prices(asset_symbol, start_date, end_date):
    """Download the closing prices of one ticker through ``yf.download``.

    Args:
        asset_symbol: Ticker handed to ``yf.download``; also becomes the name
            of the single column in the result.
        start_date: Object exposing ``strftime``; sent as ``YYYY-MM-DD`` in the
            ``start`` argument.
        end_date: Object exposing ``strftime``; sent as ``YYYY-MM-DD`` in the
            ``end`` argument.

    Returns:
        A one-column DataFrame holding the ``Close`` values, labelled
        ``asset_symbol``. On any failure (an empty download included) the error
        is printed and an empty DataFrame is returned instead of raising.
    """
    day_format = '%Y-%m-%d'
    try:
        downloaded = yf.download(
            asset_symbol,
            start=start_date.strftime(day_format),
            end=end_date.strftime(day_format),
            progress=False
        )
        if downloaded.empty:
            raise ValueError(f"No data found for {asset_symbol} between {start_date} and {end_date}.")
        closes = downloaded[['Close']].copy()
        # Whatever label the close column came back with, expose it as the bare ticker.
        closes.columns = [asset_symbol]
    except Exception as e:
        print(f"Error fetching data for {asset_symbol}: {e}")
        return pd.DataFrame()
    return closes

### Downloading historical asset news from GNews

In [5]:
# Fetch News using GNews
def fetch_all_google_news(query, start_date, end_date, max_headlines=5, max_chars=500):
    """Collect one row of concatenated news headlines per calendar day.

    For every day in ``[start_date, end_date]`` the query is sent to GNews with
    ``after:``/``before:`` date operators appended, the first ``max_headlines``
    titles are joined with spaces and the result is cut to ``max_chars``
    characters. Days for which nothing comes back (or the request fails) are
    simply absent from the result.

    Args:
        query: Search text sent to GNews.
        start_date: First day (inclusive), anything ``pd.date_range`` accepts.
        end_date: Last day (inclusive), anything ``pd.date_range`` accepts.
        max_headlines: Number of headlines requested and kept per day.
        max_chars: Maximum length of the concatenated headline text.

    Returns:
        DataFrame indexed by ``Date`` with a single ``News`` column. When no
        day produced any article the frame is empty but still has that index
        name and column, so callers can index/save it without special-casing.
    """
    gnews = GNews(language='en', country='US', max_results=max_headlines)
    all_news_list = []

    for date in tqdm(pd.date_range(start_date, end_date, freq='D'), desc="Fetching news data"):
        date_str = date.strftime('%Y-%m-%d')
        try:
            # Restrict the search to a single day with Google's date operators.
            next_day_str = (date + timedelta(days=1)).strftime('%Y-%m-%d')
            articles = gnews.get_news(f"{query} after:{date_str} before:{next_day_str}")
            if articles:
                titles = [article.get('title', 'No Title') for article in articles[:max_headlines]]
                all_news_list.append({
                    "Date": date,
                    "News": " ".join(titles)[:max_chars]
                })
        except Exception as e:
            # Best effort: a failed day is reported and skipped, not fatal.
            print(f"Error fetching news for {date_str}: {e}")

    # Passing the columns explicitly keeps "Date"/"News" present even when the
    # list is empty; without it drop_duplicates(subset=["Date"]) raises KeyError.
    all_news = pd.DataFrame(all_news_list, columns=["Date", "News"])
    return all_news.drop_duplicates(subset=["Date"]).set_index('Date')

### Load data locally, or fetch it from yfinance & GNews

In [7]:
# Load or Fetch Data
def load_or_fetch_data(asset_symbol, query, start_date, end_date):
    """Return price and news frames, using per-asset CSV files as a cache.

    Prices are cached in ``{asset_symbol}_historical_prices.csv`` and news in
    ``{asset_symbol}_news_data.csv`` (both in the working directory). If a file
    exists it is loaded; otherwise the data is fetched and then saved.

    An empty fetch result is returned but NOT written to disk: an empty cache
    file has no ``Date`` column, so the next run would fail while loading it
    and would never retry the download.

    Args:
        asset_symbol: Ticker; used for the cache file names and as the expected
            price column name.
        query: News search text passed to ``fetch_all_google_news``.
        start_date: Start of the range passed to both fetchers.
        end_date: End of the range passed to both fetchers.

    Returns:
        ``(historical_prices, news_data)`` DataFrames.

    Raises:
        ValueError: The cached price file has no column named ``asset_symbol``.
    """
    prices_path = f"{asset_symbol}_historical_prices.csv"
    news_path = f"{asset_symbol}_news_data.csv"

    # Fetch or load historical prices
    if not os.path.exists(prices_path):
        print("Fetching historical prices...")
        historical_prices = fetch_all_historical_prices(asset_symbol, start_date, end_date)
        if historical_prices.empty:
            print(f"No historical prices fetched for {asset_symbol}; nothing was cached.")
        else:
            historical_prices.to_csv(prices_path, index=True)
            print(f"Historical prices saved to '{prices_path}'.")
    else:
        print("Loading existing historical prices...")
        historical_prices = pd.read_csv(
            prices_path,
            parse_dates=["Date"],
            index_col="Date"
        )

        # Ensure the correct column name
        if asset_symbol not in historical_prices.columns:
            raise ValueError(f"The expected column '{asset_symbol}' was not found in '{asset_symbol}_historical_prices.csv'.")

        historical_prices = historical_prices[[asset_symbol]]  # Select the column for the asset

    # Fetch or load news data
    if not os.path.exists(news_path):
        print("Fetching news data...")
        news_data = fetch_all_google_news(query, start_date, end_date)
        if news_data.empty:
            print(f"No news fetched for {asset_symbol}; nothing was cached.")
        else:
            news_data.to_csv(news_path, index=True)
            print(f"News data saved to '{news_path}'.")
    else:
        print("Loading existing news data...")
        news_data = pd.read_csv(
            news_path,
            parse_dates=["Date"],
            index_col="Date"
        )

    return historical_prices, news_data

In [8]:
# Get the date using index difference.
def LastMarketOpen_DateDiff(df, curr_date, delta):
    """Return the index label that sits ``delta`` rows away from ``curr_date``.

    The offset is counted in rows of ``df`` rather than calendar days, so it
    steps over any dates that are missing from the index. If the date lives in
    a column instead of the index, use ``timedelta`` arithmetic instead.

    Args:
        df: DataFrame whose index holds the dates.
        curr_date: Label that must be present in ``df.index``.
        delta: Signed number of rows to move (negative = earlier rows).

    Returns:
        The target index label converted with ``pd.to_datetime``.

    Raises:
        KeyError: ``curr_date`` is not in ``df.index``.
        IndexError: The target row lies before the first or after the last row.
            (A negative position is rejected explicitly; ``iloc`` would
            otherwise wrap around and return a row from the END of the frame.)
    """
    curr_date_index = df.index.get_loc(curr_date)
    target_index = curr_date_index + delta
    if not 0 <= target_index < len(df):
        raise IndexError(
            f"Row offset {delta} from {curr_date} is outside the available data "
            f"(target position {target_index}, {len(df)} rows)."
        )
    return pd.to_datetime(df.index[target_index])

### Making n simulated predictions from one slice of input data
- Instead of a window of the past m calendar days, we take the past m index rows (market-open days), because stock exchanges are closed on weekends and holidays.
- News data for each market-open day should be the accumulation of all news published after the previous market-open day, up to and including the current day.

In [12]:
# Predict Next Day's Price with LLaMA (Probabilistic) including news
def _extract_price(response_text):
    """Return the first number found in ``response_text`` as a float, else NaN."""
    # The match must START with a digit. The previous pattern could match a lone
    # "," (e.g. in "Sure, the price is ..."), which then failed float("").
    match = re.search(r"\$?[0-9][0-9,]*(?:\.[0-9]+)?", response_text)
    if match is None:
        return np.nan
    return float(match.group().replace(",", "").replace("$", ""))


def _build_few_shot_prompt(asset_symbol, prices, news_by_day, news_current, window_size):
    """Assemble the five-example prompt followed by the actual query."""
    n_examples = 5
    history_len = window_size - n_examples

    def fmt(values):
        return ", ".join(f"{x:.2f}" for x in values)

    parts = [
        f"Following examples shows past {history_len} days historic prices list, recent news related to {asset_symbol} as input and next price as their corresponding output:\n"
    ]
    for k in range(n_examples):
        # Example k uses rows k .. k+history_len-1 as input and the row right
        # after them (position k - 5 counted from the end) as the expected output.
        target = k - n_examples
        terminator = "\n\n" if k == n_examples - 1 else "\n"
        parts.append(
            f"Example {k + 1}: </input> [Prices:[{fmt(prices[k:history_len + k])}], "
            f"News:{news_by_day[target]}], </output> {prices[target]}{terminator}"
        )
    parts.append(
        f"Based on the insights learned from the above examples, return the next price for {asset_symbol} using the following input:\n"
        f"</input> [Prices:[{fmt(prices[n_examples:window_size])}], News:{news_current}]\n\n"
        f"Make sure to capture the effect of Prices and News for predicting the output\n"
        f"Provide your output as a single number (e.g., 38500.00). Do not include any text other than the number."
    )
    return "".join(parts)


def predict_next_price_probabilistic(
    asset_symbol,
    current_date,
    historical_prices,
    news_data,
    window_size,
    n_simulations,
    quantiles=(0.05, 0.5, 0.95)
):
    """Ask the LLM ``n_simulations`` times for the next price and summarise the answers.

    The last ``window_size`` rows of ``historical_prices`` up to ``current_date``
    are turned into five worked examples plus one query (so ``window_size``
    should be larger than 5), the same prompt is sent ``n_simulations`` times
    in a thread pool, and the numeric answers are reduced to three quantiles.

    Args:
        asset_symbol: Name used inside the prompt.
        current_date: Timestamp present in ``historical_prices.index``; the
            prediction is for the row after it.
        historical_prices: Date-indexed price frame (one price column).
        news_data: Date-indexed frame with a ``News`` column. It is only read;
            days without an entry are treated as "No significant news.".
        window_size: Number of index rows (not calendar days) in the window.
        n_simulations: Number of LLM calls.
        quantiles: Three quantile levels, returned in the given order.

    Returns:
        ``(q0, q1, q2, raw)`` where ``raw`` is a list with one entry per
        simulation (NaN for a failed or unparseable reply). When the window is
        incomplete or no simulation ran, the quantiles are NaN and ``raw`` is a
        list of ``n_simulations`` NaNs.
    """
    def no_prediction():
        return np.nan, np.nan, np.nan, [np.nan] * max(n_simulations, 0)

    start_date = LastMarketOpen_DateDiff(historical_prices, current_date, -(window_size - 1))
    price_window = historical_prices.loc[start_date:current_date].dropna()
    historical_window = price_window.values.flatten()

    if len(historical_window) < window_size:
        print(f"Insufficient data for prediction on {current_date.date()}.")
        return no_prediction()

    # One news entry per price row, aligned by date. Reindexing a read-only view
    # keeps the caller's frame untouched and keeps news and prices the same
    # length even when some days have no (or an empty) news entry.
    if 'News' in news_data.columns:
        headlines = news_data['News']
        headlines = headlines[~headlines.index.duplicated(keep='first')]
    else:
        headlines = pd.Series(dtype=object)
    news_window = headlines.reindex(price_window.index).fillna("No significant news.").to_numpy()
    # The window ends on current_date, so its last entry is today's news.
    # NOTE(review): examples pair each target price with the news of the target
    # day, while the query uses the news of the last input day - confirm this
    # asymmetry is intended.
    news_current = news_window[-1]

    prompt = _build_few_shot_prompt(
        asset_symbol, historical_window, news_window, news_current, window_size
    )
    print(prompt)

    def run_simulation(_):
        try:
            response = ollama.chat(
                model="llama3.1",
                messages=[
                    {"role": "system", "content": "You are a financial forecasting assistant."},
                    {"role": "user", "content": prompt}
                ]
            )
            return _extract_price(response["message"]["content"])
        except Exception as e:
            print(f"LLaMA simulation failed: {e}")
        return np.nan

    predictions = []
    if n_simulations > 0:  # ThreadPoolExecutor rejects max_workers=0
        with ThreadPoolExecutor(max_workers=min(10, n_simulations)) as executor:
            futures = [executor.submit(run_simulation, i) for i in range(n_simulations)]
            for future in tqdm(
                as_completed(futures),
                total=n_simulations,
                desc=f"Simulating {n_simulations} predictions for {current_date.date()}"
            ):
                # Failed runs come back as NaN and are kept so that the raw list
                # always has n_simulations entries; nanquantile ignores them.
                predictions.append(future.result())

    if len(predictions) == 0:
        print(f"No valid predictions generated for {current_date}.")
        return no_prediction()

    # Calculate quantiles
    quantile_values = np.nanquantile(predictions, quantiles)
    return quantile_values[0], quantile_values[1], quantile_values[2], predictions

### Making predictions on a rolling window from start_date to end_date

In [18]:
# Rolling Predictions with Probabilistic Forecasting including news
def _as_raw_row(raw_predictions, n_simulations):
    """Coerce one raw simulation result into a list of exactly ``n_simulations`` values."""
    values = list(raw_predictions) if isinstance(raw_predictions, (list, tuple, np.ndarray)) else []
    values = values[:n_simulations]
    return values + [np.nan] * (n_simulations - len(values))


def _results_to_frames(predictions, raw_prediction_list, n_simulations):
    """Build the (predictions, raw simulation output) DataFrames from the accumulated rows."""
    predictions_df = pd.DataFrame(
        predictions,
        columns=["Date", "Low_Prediction", "Median_Prediction", "High_Prediction", "Actual_Price"]
    )
    raw_output_df = pd.DataFrame(raw_prediction_list,
                                 columns=[f"out_{i}" for i in range(n_simulations)])
    raw_output_df["Date"] = predictions_df["Date"]
    return predictions_df, raw_output_df


def rolling_window_price_predictions_probabilistic(
    asset_symbol,
    historical_prices,
    news_data,
    start_date,
    end_date,
    window_size,
    n_simulations,
    checkpoint_every=5
):
    """Walk over the market-open days in a date range and forecast each following day.

    For every row of ``historical_prices`` between ``start_date`` and
    ``end_date`` that has at least ``window_size - 1`` earlier rows and one
    later row, ``predict_next_price_probabilistic`` is called and its quantiles
    are stored next to the actual price of the following row.

    Args:
        asset_symbol: Price column name, also forwarded to the predictor.
        historical_prices: Date-indexed price frame, sorted ascending. Not modified.
        news_data: Date-indexed news frame. Not modified.
        start_date: First date to predict from. It may fall on a day without a
            price row; the walk then starts at the next available row.
        end_date: Last date to predict from (capped at the second-to-last row).
        window_size: Number of index rows handed to the predictor.
        n_simulations: Number of simulations per date.
        checkpoint_every: Write ``predictions_partial.csv`` and
            ``raw_output_partial.csv`` after every this many processed dates;
            a falsy value disables checkpoints.

    Returns:
        ``(predictions_df, raw_output_df)``. ``predictions_df`` has the columns
        Date, Low_Prediction, Median_Prediction, High_Prediction, Actual_Price,
        where Date is the day being predicted. ``raw_output_df`` has one
        ``out_i`` column per simulation plus the same Date, row-aligned with
        ``predictions_df``.

    Raises:
        IndexError: Fewer than ``window_size`` price rows are available.
    """
    # Work on copies so that normalising the indexes does not leak to the caller.
    prices = historical_prices.copy()
    prices.index = pd.to_datetime(prices.index).normalize().tz_localize(None)
    news = news_data.copy()
    news.index = pd.to_datetime(news.index).normalize().tz_localize(None)

    trading_days = prices.index
    # First row that has a full window of history behind it.
    earliest_date = trading_days[window_size - 1]
    print(earliest_date)

    # Translate the requested date range into row positions. searchsorted snaps
    # a start_date on a weekend/holiday to the next market-open row instead of
    # failing the later label lookup.
    first_requested = int(trading_days.searchsorted(pd.Timestamp(start_date).normalize(), side="left"))
    last_requested = int(trading_days.searchsorted(pd.Timestamp(end_date).normalize(), side="right")) - 1
    first_pos = max(window_size - 1, first_requested)
    last_pos = min(len(trading_days) - 2, last_requested)  # a following row must exist

    predictions = []
    raw_prediction_list = []

    for iteration, pos in enumerate(range(first_pos, last_pos + 1), start=1):
        current_date = trading_days[pos]
        next_date = trading_days[pos + 1]  # the market-open day being predicted

        low, median, high, raw_predictions = predict_next_price_probabilistic(
            asset_symbol,
            current_date,
            prices,
            news,
            window_size,
            n_simulations
        )

        if median is None:
            print(f"No prediction available for {current_date.date()}.")
        else:
            actual_price = prices.loc[next_date, asset_symbol]
            # Both lists grow together so their rows stay aligned.
            predictions.append((next_date, low, median, high, actual_price))
            raw_prediction_list.append(_as_raw_row(raw_predictions, n_simulations))

        if checkpoint_every and iteration % checkpoint_every == 0:
            # Persist partial results so an interrupted run is not lost.
            predictions_df, raw_output_df = _results_to_frames(
                predictions, raw_prediction_list, n_simulations
            )
            predictions_df.to_csv('predictions_partial.csv', index=False)
            raw_output_df.to_csv('raw_output_partial.csv', index=False)

    return _results_to_frames(predictions, raw_prediction_list, n_simulations)

### Plotting real and predicted prices

In [21]:
# Plot Predictions and Metrics
def plot_predictions_and_compute_metrics_probabilistic(
    predictions_df,
    asset_symbol=None,
    save_path="predictions_chart.png"
):
    """Plot actual vs. predicted prices and report RMSE / R2 of the median forecast.

    Rows with a missing ``Median_Prediction`` or ``Actual_Price`` are ignored.
    The chart is saved to ``save_path`` as a transparent PNG and then shown.

    Args:
        predictions_df: Frame with the columns Date, Low_Prediction,
            Median_Prediction, High_Prediction and Actual_Price.
        asset_symbol: Optional label prepended to the chart title.
        save_path: Output file for the chart.

    Returns:
        ``(rmse, r2)``, or ``(None, None)`` when no complete row is left.
    """
    scored = predictions_df.dropna(subset=["Median_Prediction", "Actual_Price"])
    if scored.empty:
        print("No predictions to plot or compute metrics.")
        return None, None

    actual_prices = scored["Actual_Price"]
    median_predictions = scored["Median_Prediction"]

    rmse = np.sqrt(mean_squared_error(actual_prices, median_predictions))
    r2 = r2_score(actual_prices, median_predictions)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(
        scored["Date"],
        actual_prices,
        label="Actual Prices",
        color="blue",
        linestyle='-'
    )
    ax.plot(
        scored["Date"],
        median_predictions,
        label="Median Predictions",
        color="red",
        linestyle='--'
    )
    ax.fill_between(
        scored["Date"],
        scored["Low_Prediction"],
        scored["High_Prediction"],
        color='gray',
        alpha=0.3,
        # The band is drawn from the low/high columns, which the forecasting
        # step fills with its default 0.05 and 0.95 quantiles.
        label="Prediction Range (5%-95%)"
    )
    title = "Probabilistic Forecasting: Predicted vs Actual Prices with News Influence"
    if asset_symbol:
        title = f"{asset_symbol} - {title}"
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    ax.tick_params(axis="x", labelrotation=45)
    fig.tight_layout()

    # Save the plot as a transparent PNG
    fig.savefig(save_path, transparent=True, dpi=300)
    print(f"Chart saved to {save_path} (transparent background).")

    plt.show()
    plt.close(fig)  # release the figure when the function is called repeatedly

    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Out-of-Sample R2: {r2:.4f}")
    return rmse, r2

### Execution
#### Assumptions:
- Complete market data is available for every market-open day from start_date to end_date

In [24]:
# Full candidate universe, currently disabled.
# NOTE: the commented-out query list has 10 entries for 11 symbols ('^BSESN' has
# no matching query), so it must be completed before it is re-enabled.
# assets = ['CBU','^GDAXI', '^GSPC', 'GOOG', 'XLE', '^STOXX', '^N225', '^BSESN', '^FTSE', 'TXGE', 'BTC-USD']
# news_query = ['Community Financial System Inc. stock news', 'DAX PERFORMANCE-INDEX news', 'S&P 500 news', 
#              'Google, Alphabet Inc. stock news', 'Energy Select Sector SPDR Fund news', 'STOXX Europe 600 news',
#              'Nikkei 225 news', 'FTSE 100 Index news', 'Texas Gulf Energy, Incorporated (TXGE) news', 'Bitcoin news']

assets = ['GOOG']
news_query = ['Google, Alphabet Inc. stock price']

# The two lists are read in parallel (news_query[i] is the search text for assets[i]),
# so a length mismatch would pair assets with the wrong query or fail mid-run.
assert len(assets) == len(news_query), "assets and news_query must have the same length"

In [26]:
# Test Execution
if __name__ == "__main__":
    # Every asset is paired with the news query stored at the same list position.
    for index, asset in enumerate(assets):
        asset_symbol = asset
        query = news_query[index]

        # Backtest range. To resume an interrupted run, start from the last
        # date stored in 'predictions_partial.csv' instead.
        start_date = pd.to_datetime('2024-01-01')
        end_date = pd.to_datetime('2024-11-30')
        print(start_date)
        print(end_date)
        window_size = 15      # index rows of history handed to each forecast
        n_simulations = 100   # LLM calls per forecast date; adjust as needed

        # Cached CSVs are used when present, otherwise the data is downloaded.
        historical_prices, news_data = load_or_fetch_data(
            asset_symbol, query, start_date, end_date
        )

        # Walk through the range and forecast each following market-open day.
        predictions_df, raw_output = rolling_window_price_predictions_probabilistic(
            asset_symbol=asset_symbol,
            historical_prices=historical_prices,
            news_data=news_data,
            start_date=start_date,
            end_date=end_date,
            window_size=window_size,
            n_simulations=n_simulations
        )

        # Persist the summary quantiles and the raw simulation output.
        predictions_df.to_csv(f'{asset_symbol}_predictions_full_p3.csv')
        raw_output.to_csv(f'{asset_symbol}_raw_output_p3.csv')

        # Chart the forecasts and score the median prediction.
        chart_path = f"{asset_symbol}_predictions_chart_p3.png"
        rmse, r2 = plot_predictions_and_compute_metrics_probabilistic(
            predictions_df, asset_symbol, chart_path
        )

        if not (rmse is None or r2 is None):
            print("\nFinal Metrics:")
            print(f"RMSE: {rmse:.4f}")
            print(f"R2 Score: {r2:.4f}")

2024-01-01 00:00:00
2024-11-30 00:00:00
Loading existing historical prices...
Loading existing news data...
2024-01-23 00:00:00
Following examples shows past 10 days historic prices list, recent news related to GOOG as input and next price as their corresponding output:
Example 1: </input> [Prices:[139.06, 139.86, 137.55, 136.90, 140.03, 142.05, 143.29, 143.16, 143.72, 143.56], News:How To Invest In Magnificent Seven Stocks Like Nvidia In 2024 - Investor's Business Daily Sundar Pichai is taking a leaf out of Mark Zuckerberg’s book and warns Google staff ‘ambitious goals’ can only be met with job cuts - Fortune 2 Top Growth Stocks to Buy Hand Over Fist Before the Nasdaq Soars Higher in 2024 - The Motley Fool Google CEO says more job cuts are needed in 2024 in order to reach ‘ambitious goals’ - CNBC Apple Stock vs. Microsoft Stock: Which Is Better? - TheStreet], </output> 142.37841796875
Example 2: </input> [Prices:[139.86, 137.55, 136.90, 140.03, 142.05, 143.29, 143.16, 143.72, 143.56, 

Simulating 100 predictions for 2024-01-23:   0%|        | 0/100 [00:03<?, ?it/s]

KeyboardInterrupt



In [None]:
# Write the in-memory predictions to a separately named file
# (uses `predictions_df` / `asset_symbol` left behind by the execution cell).
predictions_df.to_csv(path_or_buf=f'{asset_symbol}_predictions_full_3.3.csv')

In [None]:
# Re-draw the chart for the in-memory predictions under a separate file name.
rmse, r2 = plot_predictions_and_compute_metrics_probabilistic(
    predictions_df,
    asset_symbol,
    save_path=f"{asset_symbol}_predictions_chart_3.3.png",
)

### Test to check if ollama is working.

In [None]:
# Smoke test: send a trivial question and run the reply through the same
# number extraction that the forecasting code relies on.
response = ollama.chat(
    model="llama3.1",
    messages=[
        {"role": "system", "content": "You are a financial forecasting assistant."},
        {"role": "user", "content": "What is square of 2"}
    ]
)
response_content = response["message"]["content"]
print(response_content)
# The number must start with a digit; a pattern that also accepts a leading ","
# matches the first comma of a sentence and then fails in float("").
match = re.search(r"\$?[0-9][0-9,]*(?:\.[0-9]+)?", response_content)
if match:
    print(float(match.group().replace(",", "").replace("$", "")))

In [None]:
# Display the column labels; the cells below select everything after the first column by position.
predictions_df.columns

In [None]:
# Per-cell factor: values above 1000 are scaled by 0.01, everything else is left as is.
# NOTE(review): presumably this repairs model answers that dropped the decimal point - confirm.
# np.where keeps a value of exactly 1000 at factor 1.0; summing the two strict
# comparisons (> 1000 and < 1000) gave such a value a factor of 0.
correction_scaling = np.where(predictions_df.iloc[:, 1:].values > 1000, 0.01, 1.0)

In [None]:
# Display the scaling factors for a visual check.
correction_scaling

In [None]:
# Apply the element-wise scaling factors to every column after the first.
correct_values = np.multiply(predictions_df.iloc[:, 1:].values, correction_scaling)

In [None]:
# Rebuild a frame from the scaled values and append the original dates as the last column.
correct_df = pd.DataFrame(correct_values, columns=predictions_df.columns[1:]).assign(
    Date=predictions_df['Date'].values
)

In [None]:
# Score and plot the rescaled predictions. The asset symbol is passed explicitly
# because it is a required positional parameter of the plotting function.
rmse, r2 = plot_predictions_and_compute_metrics_probabilistic(correct_df, asset_symbol)

In [None]:
# Scratch expression: remainder of 4 divided by 3 (evaluates to 1); unrelated to the analysis above.
4%3