In [2]:
!pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [4]:

import os
import joblib
from datetime import datetime, timedelta
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import yfinance as yf
import random

In [5]:
def fetch_stock_data(ticker, train_start='2010-01-01', train_end='2020-12-31',
                     val_start='2021-01-01', val_end='2023-01-01'):
    """
    Fetch stock data for the given ticker and timeframes
    """
    if ticker is None:
        raise ValueError("Ticker symbol not provided.")

    data = yf.download(ticker, start=train_start, end=val_end)
    data = data.asfreq('B')
    data = data.ffill().bfill()
    training_data = data['Close'][train_start:train_end]
    validation_data = data['Close'][val_start:val_end]

    return ticker, data, training_data, validation_data

def build_and_train_model(training_data, validation_data):
    """
    Build and train an ARIMA model on the provided training data and validate it
    """
    print("Finding best ARIMA parameters with auto_arima")
    model = auto_arima(training_data, seasonal=False, trace=True,
                       error_action='ignore', suppress_warnings=True)

    predictions = model.predict(n_periods=len(validation_data))
    mse = mean_squared_error(validation_data, predictions)
    rmse = np.sqrt(mse)
    print(f"Validation RMSE: {rmse:.2f}")

    return model, predictions

def update_model_with_all_data(model, data):
    """
    Update the model with all available data
    """
    all_data = data['Close'].asfreq('B').ffill().bfill()
    model.update(all_data)
    return model

def predict_next_12_months(model, data, ticker):
    """
    Predict stock prices for the next 12 months
    """
    if model is None or data is None:
        raise ValueError("Model has not been trained or data is not loaded")

    last_date = data.index[-1]
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1),
                                 periods=252, freq='B')
    predictions = model.predict(n_periods=252)

    # Introduce a small bias to integrate volatility
    for i in range(1, len(predictions)):
        if predictions[i] > predictions[i - 1]:
            predictions[i] *= 1.01
        else:
            predictions[i] *= 0.99

    future_predictions = pd.Series(predictions, index=future_dates)

    monthly_indices = pd.date_range(start=future_dates.min(), end=future_dates.max(), freq='ME')
    monthly_predictions = [future_predictions.loc[future_predictions.index[
        future_predictions.index.get_indexer([date], method='nearest')[0]]] for date in monthly_indices]

    return monthly_predictions, monthly_indices

def predict_next_5_years(model, data, ticker):
    """
    Predict stock prices for the next 5 years
    """
    if model is None or data is None:
        raise ValueError("Model has not been trained or data is not loaded")

    last_date = data.index[-1]
    # 252 business days / year
    future_periods = 252 * 5
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1),
                                 periods=future_periods, freq='B')
    predictions = model.predict(n_periods=future_periods)

    # introduce a small bias to integrate volatility
    for i in range(1, len(predictions)):
        if predictions[i] > predictions[i - 1]:
            predictions[i] *= 1.001
        else:
            predictions[i] *= 0.999

    future_predictions = pd.Series(predictions, index=future_dates)

    # Generate monthly indices over the 5-year period
    monthly_indices = pd.date_range(start=future_dates.min(), end=future_dates.max(), freq='M')
    monthly_predictions = [future_predictions.loc[future_predictions.index[
        future_predictions.index.get_indexer([date], method='nearest')[0]]] for date in monthly_indices]

    return monthly_predictions, monthly_indices

def print_monthly_predictions(predictions, future_dates, ticker):
    """
    Print the monthly predictions in a formatted table
    """
    print(f"\n{ticker} Monthly Price Predictions:")
    print("-" * 40)
    print(f"{'Month':<15} {'Predicted Price':<15}")
    print("-" * 40)
    for i in range(len(predictions)):
        month_str = future_dates[i].strftime('%b %Y')
        price_str = f"${predictions[i]:.2f}"
        print(f"{month_str:<15} {price_str:<15}")

def adjust_predictions_with_sentiment(predictions, weighted_sentiment_score):
    """
    Adjust price predictions based on sentiment analysis
    """
    adjusted_predictions = []
    score_intervals = {
        (-1, -0.75): (0.15, 0.2),
        (-0.75, -0.2): (0.1, 0.15),
        (-0.2, 0.2): (0.02, 0.1),
        (0.2, 0.75): (0.1, 0.15),
        (0.75, 1): (0.15, 0.2)
    }
    initial_impact_factor = 0.01

    for (low, high), impact in score_intervals.items():
        if low <= weighted_sentiment_score <= high:
            initial_impact_factor = random.uniform(impact[0], impact[1])

    for i, prediction in enumerate(predictions):
        # use exponential decay for a gradual decrease of the impact over the months
        gradual_impact = initial_impact_factor * (0.6 ** i)
        adjusted_prediction = prediction * (1 + weighted_sentiment_score * gradual_impact)
        adjusted_predictions.append(adjusted_prediction)

    return adjusted_predictions

def run_stock_prediction(ticker, data, training_data, validation_data):
    """
    Run the complete stock prediction pipeline for a given ticker
    """
    print(f"Processing {ticker} stock prediction")

    print("Training ARIMA model")
    model, val_predictions = build_and_train_model(training_data, validation_data)

    model = update_model_with_all_data(model, data)

    print("Predicting prices for the next 12 months...")
    predictions, future_dates = predict_next_12_months(model, data, ticker)

    return predictions, future_dates

def run_stock_prediction_with_sentiment(ticker, weighted_sentiment_score):
    """
    Run the complete stock prediction pipeline with sentiment adjustment
    """
    ticker, data, training_data, validation_data = fetch_stock_data(ticker)

    model, _ = build_and_train_model(training_data, validation_data)

    model = update_model_with_all_data(model, data)

    predictions, future_dates = predict_next_12_months(model, data, ticker)

    adjusted_predictions = adjust_predictions_with_sentiment(predictions, weighted_sentiment_score)

    return adjusted_predictions, future_dates


In [14]:
print("Training on S&P 500 data...")
ticker_sp500 = "^GSPC"
_, sp500_data, sp500_training_data, sp500_validation_data = fetch_stock_data(ticker_sp500)

[*********************100%***********************]  1 of 1 completed

Training on S&P 500 data...





In [15]:
print("Training on S&P 500 data...")
ticker_sp500 = "^GSPC"
_, sp500_data, sp500_training_data, sp500_validation_data = fetch_stock_data(ticker_sp500)

sp500_model, sp500_val_predictions = build_and_train_model(sp500_training_data, sp500_validation_data)

sp500_model = update_model_with_all_data(sp500_model, sp500_data)

print("S&P 500 model has been successfully trained")

[*********************100%***********************]  1 of 1 completed

Training on S&P 500 data...
Finding best ARIMA parameters with auto_arima
Performing stepwise search to minimize aic





 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=26323.885, Time=3.20 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=26450.647, Time=0.11 sec




 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=26360.584, Time=0.19 sec




 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=26378.313, Time=0.66 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=26452.697, Time=0.10 sec




 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=26325.076, Time=1.57 sec




 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=26332.011, Time=1.39 sec




 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=26210.214, Time=5.49 sec




 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=26252.462, Time=3.35 sec




 ARIMA(4,1,2)(0,0,0)[0] intercept   : AIC=26246.580, Time=7.79 sec




 ARIMA(3,1,3)(0,0,0)[0] intercept   : AIC=26311.611, Time=6.56 sec




 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=26317.937, Time=5.92 sec




 ARIMA(4,1,1)(0,0,0)[0] intercept   : AIC=26243.377, Time=5.34 sec




 ARIMA(4,1,3)(0,0,0)[0] intercept   : AIC=26213.972, Time=8.71 sec




 ARIMA(3,1,2)(0,0,0)[0]             : AIC=26213.186, Time=1.34 sec

Best model:  ARIMA(3,1,2)(0,0,0)[0] intercept
Total fit time: 51.732 seconds
Validation RMSE: 405.52




S&P 500 model has been successfully trained


In [16]:
prediction_ticker = "AAPL"

_, ticker_data, _, _ = fetch_stock_data(prediction_ticker)

print(f"\nGenerating predictions for {prediction_ticker} using S&P 500 model...")
predictions, dates = predict_next_12_months(sp500_model, ticker_data, prediction_ticker)

print(f"\nPredicted prices for {prediction_ticker}:")
for i in range(len(predictions)):
    print(f"price {predictions[i]:.2f} at date {dates[i]}")

[*********************100%***********************]  1 of 1 completed


Generating predictions for AAPL using S&P 500 model...

Predicted prices for AAPL:
price 3887.67 at date 2023-01-31 00:00:00
price 3896.05 at date 2023-02-28 00:00:00
price 3828.82 at date 2023-03-31 00:00:00
price 3915.34 at date 2023-04-30 00:00:00
price 3924.94 at date 2023-05-31 00:00:00
price 3934.53 at date 2023-06-30 00:00:00
price 3865.60 at date 2023-07-31 00:00:00
price 3953.72 at date 2023-08-31 00:00:00
price 3884.41 at date 2023-09-30 00:00:00
price 3893.82 at date 2023-10-31 00:00:00
price 3903.22 at date 2023-11-30 00:00:00





In [17]:
sentiment_score = 0.5
print(f"\nAdjusting {prediction_ticker} predictions with sentiment score {sentiment_score}...")
adj_predictions = adjust_predictions_with_sentiment(predictions, sentiment_score)

print(f"\nSentiment-adjusted prices for {prediction_ticker}:")
for i in range(len(adj_predictions)):
    print(f"price {adj_predictions[i]:.2f} at date {dates[i]}")


Adjusting AAPL predictions with sentiment score 0.5...

Sentiment-adjusted prices for AAPL:
price 4178.22 at date 2023-01-31 00:00:00
price 4070.75 at date 2023-02-28 00:00:00
price 3931.83 at date 2023-03-31 00:00:00
price 3978.54 at date 2023-04-30 00:00:00
price 3962.95 at date 2023-05-31 00:00:00
price 3957.40 at date 2023-06-30 00:00:00
price 3879.08 at date 2023-07-31 00:00:00
price 3962.00 at date 2023-08-31 00:00:00
price 3889.29 at date 2023-09-30 00:00:00
price 3896.75 at date 2023-10-31 00:00:00
price 3904.99 at date 2023-11-30 00:00:00
