In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
%matplotlib inline
import yfinance as yf

import warnings
warnings.filterwarnings("ignore")

# Regression Analysis: Seasonal Effects with Sklearn Linear Regression

In this notebook, you will build a scikit-learn linear regression model that predicts Forex futures ("Adj Close") percentage returns from the previous period's (lagged) return.


In [None]:
# Download the price history for a user-chosen currency code from Yahoo Finance.
# The "=X" suffix is appended to the entered code to build the symbol passed to yf.download().

# Normalise the raw entry: trim surrounding whitespace and upper-case it.
ticker = input("Enter ticker of your choice").strip().upper()
# Accept an entry that already carries the suffix (e.g. "EURUSD=X") without doubling it.
# NOTE: str.strip('=X') must not be used here -- it removes any leading/trailing
# '=' or 'X' characters, so "XAUUSD=X" would become "AUUSD".
if ticker.endswith('=X'):
    ticker = ticker[:-len('=X')]
name = ticker + '=X'

# auto_adjust=False asks yfinance to keep a separate "Adj Close" column, which the
# returns cell below relies on (recent yfinance releases reportedly default to
# auto_adjust=True and omit that column -- confirm against the installed version).
df = yf.download(name, auto_adjust=False)
# Some yfinance versions return two-level (field, ticker) columns even for a single
# symbol; keep only the field level so df["Adj Close"] is a plain column.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)
df

In [None]:
# Keep only the observations dated January 1st, 2000 or later.
# .copy() makes the trimmed frame independent of the downloaded one, so the columns
# added to `df` in later cells are written to a real frame rather than to a slice
# (which would raise SettingWithCopyWarning, currently hidden by the warnings filter).
df = df.loc["2000-01-01":, :].copy()
df.head()

# Data Preparation
**Returns**

In [None]:
# Compute percentage returns of the "Adj Close" price: pct_change() scaled by 100.
# pct_change() can produce +inf / -inf (division by a zero price). Both signs are
# mapped to NaN on `df` itself -- not only on a display copy -- so that the dropna()
# in the lagged-return cell removes those rows before they reach the model.
df['Return'] = (
    df[["Adj Close"]].pct_change() * 100
).replace([np.inf, -np.inf], np.nan)
# `returns` is a NaN-free view of the data used only to check the result here.
returns = df.dropna()
returns.tail()

# Lagged Returns

In [None]:
# Add the previous row's return as the predictor column "Lagged_Return"
# (shift() moves every value down by one row), then discard rows that contain
# NaN -- including the first row, which has no predecessor.
df = df.assign(Lagged_Return=df["Return"].shift(1)).dropna()
df.tail()

# Train Test Split

In [None]:
# Chronological split by index label: rows up to and including 2017 form the
# training set, rows from 2018 onward are held out for testing.
# (Relies on label slicing with year strings -- assumes `df` has a date-like index.)
train, test = df.loc[:'2017'], df.loc['2018':]

In [None]:
# Split each partition into predictor and target:
#   X_train / X_test -- one-column DataFrames holding the independent variable "Lagged_Return"
#   y_train / y_test -- Series holding the dependent variable "Return"
X_train, X_test = train[["Lagged_Return"]], test[["Lagged_Return"]]
y_train, y_test = train["Return"], test["Return"]

In [None]:
# Display the training predictors (the one-column "Lagged_Return" frame built above) as a sanity check.
X_train

# Linear Regression Model

In [None]:
# Linear regression of "Return" on "Lagged_Return".
from sklearn.linear_model import LinearRegression

# Instantiate the estimator and fit it on the training partition only
# (X_train, y_train); fit() returns the fitted estimator itself.
model = LinearRegression().fit(X_train, y_train)
model

# Make predictions using the Testing Data

Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.

In [None]:
# Predict the target for the held-out rows; only the test predictors are passed in.
predictions = model.predict(X=X_test)

In [None]:
# Put the actual test returns (y_test) and the model's predictions side by side
# in one two-column DataFrame: "Return" and "Predicted Return".
Results = y_test.to_frame().assign(**{"Predicted Return": predictions})

In [None]:
import matplotlib.pyplot as plt

# Plot the first 20 rows of actual vs. predicted returns, one subplot per column,
# and write the current figure to "Prediction_plot.png".
prediction_plot = Results.iloc[:20].plot(subplots=True)
plt.savefig("Prediction_plot.png")

# Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (X_test and y_test)


In [None]:
from sklearn.metrics import mean_squared_error

# Out-of-sample error: mean squared error between the actual and the predicted
# test returns ...
mse = mean_squared_error(y_true=Results["Return"], y_pred=Results["Predicted Return"])

# ... and its square root (RMSE), which is in the same units as the returns.
rmse = np.sqrt(mse)
print(f"Out-of-Sample Root Mean Squared Error (RMSE): {rmse}")



# In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)


In [None]:
# In-sample counterpart of the out-of-sample evaluation: pair the training targets
# with the model's predictions on the training predictors.
in_sample_results = y_train.to_frame().assign(
    **{"In-sample Predictions": model.predict(X_train)}
)

# Mean squared error on the training data ...
in_sample_mse = mean_squared_error(
    y_true=in_sample_results["Return"],
    y_pred=in_sample_results["In-sample Predictions"],
)

# ... and its square root, for direct comparison with the out-of-sample RMSE.
in_sample_rmse = np.sqrt(in_sample_mse)
print(f"In-sample Root Mean Squared Error (RMSE): {in_sample_rmse}")