# Predicting prices with a single-asset regression model

## Preparing the independent and target variables

In [158]:
# This command will fix the corrupted numpy in your active environment
!conda install --force-reinstall -c conda-forge numpy=1.26.4 --yes

Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Lenovo\anaconda3

  added / updated specs:
    - numpy=1.26.4




Downloading and Extracting Packages: ...working... done
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [207]:
# --- 1. SETUP AND DATA DOWNLOAD (Corrected Date Range) ---
import yfinance as yf
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Symbols fetched for this section (this cell itself only reads GS and JPM)
TICKERS = ['AAPL', 'JPM', 'GS', 'SPY', 'GLD', 'UUP', 'IEF']

# Adjusted closing prices, one column per ticker.
# NOTE(review): yfinance documents `end` as exclusive, so 2019-12-31 itself
# is presumably not part of the download -- confirm this is intended.
df_data = yf.download(
    tickers=TICKERS,
    start='2010-01-01',
    end='2019-12-31',
    auto_adjust=True,
    progress=False,
)['Close']

# --- 2. DATA PREPARATION ---
# Single-column frames for the two banks
df_jpm = df_data[['JPM']]
df_gs = df_data[['GS']]

# Target: JPM closes as a Series; features: GS closes in a one-column frame
jpm_prices = df_jpm['JPM']
df_x = pd.DataFrame({'GS': df_gs['GS']})

# --- 3. MODEL TRAINING ---
# NOTE(review): LinearRegressionModel is not defined in this cell; it must
# already exist in the kernel (an earlier cell) for this to run.
linear_reg_model = LinearRegressionModel()
linear_reg_model.learn(
    df_x,
    jpm_prices,
    start_date='2018',
    end_date='2019',
    lookback_period=20,
)

# --- 4. PREDICTION AND ERROR CALCULATION ---
# Evaluation window (both bounds inclusive)
start_date_ts = pd.to_datetime('2018-01-01')
end_date_ts = pd.to_datetime('2019-12-31')

# Keep only the dates inside the window for which a fitted model is stored
all_dates = df_x.index
in_window = (all_dates >= start_date_ts) & (all_dates <= end_date_ts)
dates = [d for d in all_dates[in_window] if d in linear_reg_model.models]

# One prediction per retained date
predictions = [linear_reg_model.predict(df_x, a_date) for a_date in dates]

# Predictions next to the realised JPM prices; the frame takes its index
# from the selected price Series
df_result = pd.DataFrame({
    'prediction': predictions,
    'actual': jpm_prices.loc[dates]
})

# Mean absolute error over the evaluation window
predicted = df_result['prediction']
actual = df_result['actual']

mae = mean_absolute_error(actual, predicted)
print(f'Mean Absolute Error: {mae:.4f}')

Training starting...
Training completed.
Mean Absolute Error: 1.2174


### Mean squared error (MSE) as a risk metric

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the `actual` and `predicted` series; both
# names must already exist in the kernel from the prediction cell.
mse = mean_squared_error(actual, predicted)
print('mean squared error:', mse)

### Explained variance score as a risk metric

In [None]:
from sklearn.metrics import explained_variance_score
# Explained variance score of the predictions against the actual values
eva = explained_variance_score(actual, predicted)
print('explained variance score:', eva)

### R<sup>2</sup> as a risk metric

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions
r2 = r2_score(actual, predicted) 
print('r2 score:', r2)

## Ridge regression

In [None]:
from sklearn.linear_model import Ridge

class RidgeRegressionModel(LinearRegressionModel):
    """LinearRegressionModel variant that fits scikit-learn's Ridge estimator."""

    def _train(self, x, y):
        """
        Fit a default-parameter Ridge model on one training window.

        x: feature matrix handed to Ridge.fit.
        y: target values; must expose .to_numpy() (e.g. a pandas Series).
        Returns the fitted Ridge estimator.
        """
        # NOTE(review): the other subclasses in this notebook override
        # get_model() rather than _train(); confirm which hook the base
        # class actually calls.
        # Ridge.fit returns the estimator itself, so fit and return together.
        return Ridge().fit(x, y.to_numpy())

In [None]:
# Train the ridge variant on the same GS -> JPM inputs and settings that
# were used for the plain linear model.
ridge_reg_model = RidgeRegressionModel()
ridge_reg_model.learn(df_x, jpm_prices, start_date='2018', 
                      end_date='2019', lookback_period=20)

In [None]:
# mean_squared_error is imported here as well: the function below calls it,
# and relying on a separate cell's import breaks when that cell has not run.
from sklearn.metrics import (
    accuracy_score, mean_absolute_error, mean_squared_error,
    explained_variance_score, r2_score
)
def print_regression_metrics(df_result):
    """
    Print MAE, MSE, explained variance and R^2 for a results frame.

    df_result: frame with an 'Actual' and a 'Predicted' column.
    Returns None; the four metrics are written to stdout.
    """
    actual = list(df_result['Actual'])
    predicted = list(df_result['Predicted'])
    print('mean_absolute_error:',
          mean_absolute_error(actual, predicted))
    print('mean_squared_error:', mean_squared_error(actual, predicted))
    print('explained_variance_score:',
          explained_variance_score(actual, predicted))
    print('r2_score:', r2_score(actual, predicted))

In [None]:
# Print the regression metrics for the results stored on the ridge model.
# NOTE(review): assumes learn() left a df_result frame with 'Actual' and
# 'Predicted' columns on the model -- the base class is not shown here.
print_regression_metrics(ridge_reg_model.df_result)

# Predicting returns with a cross-asset momentum model

## Preparing the independent variables

In [None]:
import yfinance as yf
import pandas as pd

# Symbols needed by the cross-asset momentum model
TICKERS = ['JPM', 'GS', 'SPY', 'GLD', 'UUP', 'IEF']

# Adjusted closing prices, one column per ticker
df_data = yf.download(
    tickers=TICKERS,
    start='2010-01-01',
    end='2019-12-31',
    auto_adjust=True,
    progress=False,
)['Close']


def _close_frame(ticker):
    """Return `ticker`'s prices as a one-column frame named '4. close'."""
    return df_data[[ticker]].rename(columns={ticker: '4. close'})


# Per-asset frames used by the rest of the notebook (SPY is stored as df_spx)
df_jpm = _close_frame('JPM')
df_gs = _close_frame('GS')
df_spx = _close_frame('SPY')
df_gld = _close_frame('GLD')
df_uup = _close_frame('UUP')
df_ief = _close_frame('IEF')

print("All data downloaded successfully.")
df_jpm.head()  # Display a sample to confirm

In [None]:
# Trailing returns over 21 rows (~1 month of trading days) and 63 rows
# (~3 months) for each macro asset; rows with any missing value are dropped.
df_input = pd.DataFrame({
    'SPX_1m': df_spx['4. close'].pct_change(21),
    'GLD_1m': df_gld['4. close'].pct_change(21),
    'UUP_1m': df_uup['4. close'].pct_change(21),
    'IEF_1m': df_ief['4. close'].pct_change(21),
    'SPX_3m': df_spx['4. close'].pct_change(63),
    'GLD_3m': df_gld['4. close'].pct_change(63),
    'UUP_3m': df_uup['4. close'].pct_change(63),
    'IEF_3m': df_ief['4. close'].pct_change(63)
}).dropna()

# Direction label for JPM. The leading NaN from pct_change() is dropped
# first: comparing NaN yields False, which would silently label the first
# row as a "down" day. `>= 0` matches the y_direction definition used in the
# classification section; the comparison already yields booleans, so no
# cast is needed.
jpm_returns = df_jpm['4. close'].pct_change().dropna()
y_direction = jpm_returns >= 0

In [None]:
# FIX: Install and import the yfinance library
!pip install yfinance

import yfinance as yf
import pandas as pd

# FIX: Define all tickers and download them in a single, efficient call
TICKERS = ['JPM', 'GS', 'SPY', 'GLD', 'UUP', 'IEF']

# Download the closing price data for all tickers at once
df_data = yf.download(
    TICKERS,
    start='2010-01-01',
    end='2019-12-31', # Use a consistent date range
    progress=False,
    auto_adjust=True
)['Close']

# --- Create the individual DataFrames the notebook expects ---
# This ensures the rest of your code will work without changes.

df_jpm = df_data[['JPM']].rename(columns={'JPM': '4. close'})
df_gs  = df_data[['GS']].rename(columns={'GS': '4. close'})
df_spx = df_data[['SPY']].rename(columns={'SPY': '4. close'})
df_gld = df_data[['GLD']].rename(columns={'GLD': '4. close'})
df_uup = df_data[['UUP']].rename(columns={'UUP': '4. close'})
df_ief = df_data[['IEF']].rename(columns={'IEF': '4. close'})

print("All required stock and ETF data has been downloaded successfully.")
df_jpm.head() # Display a sample to confirm it worked

In [None]:
# Join the four lagged frames on their index and drop incomplete rows.
# NOTE(review): df_assets_1m / _3m / _6m / _12m are not defined in any cell
# shown here; they must already exist in the kernel for this cell to run.
df_lagged = (
    df_assets_1m
    .join(df_assets_3m)
    .join(df_assets_6m)
    .join(df_assets_12m)
    .dropna()
)

In [None]:
# Summarise the combined feature frame
df_lagged.info()

## Preparing the target variables

In [None]:
# Target variable: row-over-row percentage change of the JPM prices, with
# the leading NaN produced by pct_change() dropped.
y = jpm_prices.pct_change().dropna()

In [None]:
# Fit the base regression model on the lagged multi-asset features, using
# JPM returns as the target and a 10-row lookback.
multi_linear_model = LinearRegressionModel()
multi_linear_model.learn(df_lagged, y, start_date='2018', 
                         end_date='2019', lookback_period=10)

In [None]:
# Plot the model's result frame with one line style per column; the
# trailing semicolon suppresses the Axes repr in the cell output.
multi_linear_model.df_result.plot(
    title='JPM actual versus predicted percentage returns',
    style=['-', '--'], figsize=(12,8));

In [None]:
# Print the regression metrics for the multi-asset linear model
print_regression_metrics(multi_linear_model.df_result)

## An ensemble of decision trees

### Bagging regressor

In [None]:
from sklearn.ensemble import BaggingRegressor

class BaggingRegressorModel(LinearRegressionModel):
    """LinearRegressionModel variant that supplies a bagging ensemble."""

    def get_model(self):
        """Return a new BaggingRegressor with 20 estimators and a fixed seed."""
        # NOTE(review): this only takes effect if the base class builds its
        # estimator through get_model(); the base class is not shown here.
        estimator = BaggingRegressor(n_estimators=20, random_state=0)
        return estimator

In [None]:
# Train the bagging variant on the same lagged features and return target
bagging = BaggingRegressorModel()
bagging.learn(df_lagged, y, start_date='2018', 
              end_date='2019', lookback_period=10)

In [None]:
# Print the regression metrics for the bagging model
print_regression_metrics(bagging.df_result)

# Predicting trends with classification-based machine learning

## Preparing the target variables

In [None]:
import numpy as np
# Boolean target: True where the JPM return in `y` is zero or positive
y_direction = y >= 0

In [None]:
# Peek at the first three direction labels
y_direction.head(3)

In [None]:
# Distinct direction labels in ascending order (False before True)
flags = sorted(y_direction.unique())

In [None]:
# Show the sorted distinct labels
print(flags)

## Preparing the dataset of multiple assets as input variables

In [None]:
# Classification features: the `_1m` and `_3m` frames joined, with rows
# containing missing values dropped.
# NOTE(review): df_assets_1m / df_assets_3m are not defined in any cell
# shown here, and this rebinds the df_input built in an earlier cell.
df_input = df_assets_1m.join(df_assets_3m).dropna()

In [None]:
# Summarise the classification feature frame
df_input.info()

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

class LogisticRegressionModel(LinearRegressionModel):
    """LinearRegressionModel variant that supplies a logistic classifier."""

    def get_model(self):
        """Return a new LogisticRegression estimator using the lbfgs solver."""
        # NOTE(review): this only takes effect if the base class builds its
        # estimator through get_model(); the base class is not shown here.
        classifier = LogisticRegression(solver='lbfgs')
        return classifier

In [None]:
# Train the logistic model on the joined features against the boolean
# direction target, using a 100-row lookback.
logistic_reg_model = LogisticRegressionModel()
logistic_reg_model.learn(df_input, y_direction, start_date='2018', 
                         end_date='2019', lookback_period=100)

In [None]:
# Preview the first rows of the logistic model's result frame
logistic_reg_model.df_result.head()

### Risk metrics for measuring classification-based predictions

### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Pull the actual and predicted labels out of the logistic model's results
df_result = logistic_reg_model.df_result
actual, predicted = (list(df_result[col]) for col in ('Actual', 'Predicted'))

# Rows are actual labels, columns are predicted labels
matrix = confusion_matrix(actual, predicted)

In [None]:
# Show the raw confusion-matrix counts
print(matrix)

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Draw on an explicit Axes instead of relying on pyplot's implicit state.
# The matrix is transposed so predicted labels run down the y-axis and
# actual labels run along the x-axis, matching the axis labels below.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    matrix.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=flags,
    yticklabels=flags,
    ax=ax,
)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('JPM percentage returns 2018');

### Accuracy score

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the predicted labels against the actual labels
print('accuracy_score:', accuracy_score(actual, predicted))

### Precision score

In [None]:
from sklearn.metrics import precision_score
# Precision of the predicted labels (scikit-learn default settings)
print('precision_score:', precision_score(actual, predicted))

### Recall score

In [None]:
from sklearn.metrics import recall_score
# Recall of the predicted labels (scikit-learn default settings)
print('recall_score:', recall_score(actual, predicted))

### F1 Score

In [None]:
from sklearn.metrics import f1_score
# F1 score of the predicted labels (scikit-learn default settings)
print('f1_score:', f1_score(actual, predicted))

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

class SVCModel(LogisticRegressionModel):
    """LogisticRegressionModel variant that supplies a support vector classifier."""

    def get_model(self):
        """Return a new SVC estimator with C=1000 and gamma='auto'."""
        classifier = SVC(C=1000, gamma='auto')
        return classifier

In [None]:
# Train the SVC variant with the same inputs and settings as the logistic model
svc_model = SVCModel()
svc_model.learn(df_input, y_direction, start_date='2018', 
                end_date='2019', lookback_period=100)

In [None]:
# Actual and predicted labels from the SVC model's results
df_result = svc_model.df_result
actual = list(df_result['Actual'])
predicted = list(df_result['Predicted'])

# Print the four classification metrics in a fixed order; the metric
# functions come from the imports in the earlier metric cells.
for label, metric in (
    ('accuracy_score:', accuracy_score),
    ('precision_score:', precision_score),
    ('recall_score:', recall_score),
    ('f1_score:', f1_score),
):
    print(label, metric(actual, predicted))