In [2]:
# Installing the 'yfinance' library, which allowed downloading financial data from Yahoo Finance
# Installing the 'pandas' library, which is used for data manipulation and analysis

!pip install yfinance 
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [4]:
import yfinance as yf
import pandas as pd

# Fetch closing prices for a single stock
def get_price(tick,start='2022-10-01',end=None):
    """Return the 'Close' column of one ticker's price history.

    Parameters
    ----------
    tick : ticker symbol handed to ``yf.Ticker``.
    start : start of the requested window, forwarded to ``history``
        (defaults to '2022-10-01').
    end : end of the requested window, forwarded to ``history`` unchanged.
        NOTE(review): presumably ``None`` means "up to the latest available
        data" -- confirm against the yfinance documentation.

    Returns
    -------
    Whatever ``history(...)['Close']`` yields for the ticker.
    """
    ticker = yf.Ticker(tick)
    history = ticker.history(start=start, end=end)
    return history['Close']


# Fetch closing prices for multiple stocks
def get_prices(tickers,start='2022-10-01',end=None):
    """Collect closing prices for several tickers into one DataFrame.

    Each ticker is fetched with ``get_price`` and stored as a column named
    after the ticker, in the order given by ``tickers``. Columns are added one
    at a time to an initially empty frame, so pandas aligns every later column
    on the index already present in the frame.

    Parameters
    ----------
    tickers : iterable of ticker symbols.
    start, end : window bounds forwarded positionally to ``get_price``.

    Returns
    -------
    pandas.DataFrame with one column per ticker (empty if ``tickers`` is empty).
    """
    prices = pd.DataFrame()
    for symbol in tickers:
        closes = get_price(symbol, start, end)
        prices[symbol] = closes
    return prices

In [6]:
# Build the training and testing data sets.
# Closing prices of feature_stocks are the model inputs; the closing price of
# predict_stock is the value to be predicted.
feature_stocks = ['tsla', 'meta', 'goog', 'amzn', 'nflx', 'gbtc', 'gdx', 'intc', 'dal', 'c']
predict_stock = 'msft'

# Date windows handed to get_prices for each split
start_date_train, end_date_train = '2023-1-01', '2024-6-30'
start_date_test, end_date_test = '2024-11-01', '2024-12-31'

# Training set
train_window = {'start': start_date_train, 'end': end_date_train}
X_train = get_prices(feature_stocks, **train_window)
y_train = get_prices([predict_stock], **train_window)

# Testing set: same tickers over the later window
test_window = {'start': start_date_test, 'end': end_date_test}
X_test = get_prices(feature_stocks, **test_window)
y_test = get_prices([predict_stock], **test_window)

In [8]:
# Preview the training features: one closing-price column per ticker in feature_stocks.
X_train

Unnamed: 0_level_0,tsla,meta,goog,amzn,nflx,gbtc,gdx,intc,dal,c
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-01-03 00:00:00-05:00,108.099998,124.265305,89.378853,85.820000,294.950012,8.200000,28.842234,25.775145,32.105251,41.898056
2023-01-04 00:00:00-05:00,113.639999,126.885315,88.392395,85.139999,309.410004,8.380000,30.067492,26.691208,33.857693,42.977997
2023-01-05 00:00:00-05:00,110.339996,126.456940,86.459343,83.120003,309.700012,8.450000,29.804935,26.575495,34.684692,42.785805
2023-01-06 00:00:00-05:00,113.059998,129.525238,87.844376,86.080002,315.549988,8.650000,30.689848,27.703697,35.472309,43.298321
2023-01-09 00:00:00-05:00,119.769997,128.977325,88.482086,87.360001,315.170013,9.650000,30.398119,28.262980,36.200855,43.508820
...,...,...,...,...,...,...,...,...,...,...
2024-06-24 00:00:00-04:00,182.580002,498.032776,180.347717,185.570007,669.020020,52.610001,33.852570,30.377298,49.083549,59.807236
2024-06-25 00:00:00-04:00,187.350006,509.702240,185.126007,186.339996,672.409973,55.020000,33.447327,30.546227,48.497326,60.041241
2024-06-26 00:00:00-04:00,196.369995,512.217773,184.916504,193.610001,677.690002,54.130001,33.427559,30.347488,47.871361,59.719482
2024-06-27 00:00:00-04:00,197.419998,518.646423,186.402878,197.850006,684.340027,54.520000,33.832802,30.397173,48.288670,60.011986


In [10]:
# Preview the training target: a single closing-price column named after predict_stock.
y_train

Unnamed: 0_level_0,msft
Date,Unnamed: 1_level_1
2023-01-03 00:00:00-05:00,235.240021
2023-01-04 00:00:00-05:00,224.949875
2023-01-05 00:00:00-05:00,218.282867
2023-01-06 00:00:00-05:00,220.855423
2023-01-09 00:00:00-05:00,223.005722
...,...
2024-06-24 00:00:00-04:00,445.079468
2024-06-25 00:00:00-04:00,448.340454
2024-06-26 00:00:00-04:00,449.543457
2024-06-27 00:00:00-04:00,450.229492


In [12]:
#Convert training and testing data into numpy array
import numpy as np

# Feature tables become plain numpy arrays.
X_train, X_test = np.array(X_train), np.array(X_test)
# Targets are forced into explicit column vectors (one row per observation, one column).
y_train, y_test = (np.array(target).reshape(-1, 1) for target in (y_train, y_test))

In [14]:
# Prepend a constant column of ones (the dummy / bias feature) to X_train and X_test.
# The ones are stacked in front, so the bias ends up as column 0 of each matrix.
bias_train = np.ones((len(X_train), 1))
bias_test = np.ones((len(X_test), 1))
X_train = np.hstack([bias_train, X_train])
X_test = np.hstack([bias_test, X_test])

In [None]:
# Fit ordinary least squares through the normal equation (X^T X) w = X^T y.
XTX = X_train.T @ X_train
XTy = X_train.T @ y_train
# Solve the linear system directly instead of forming inv(X^T X) explicitly:
# the solution is the same in exact arithmetic, but solving avoids the extra
# round-off and work of computing a full inverse. Like inv, solve raises
# numpy.linalg.LinAlgError when X^T X is singular.
w = np.linalg.solve(XTX, XTy)

print("\n Best Weights Computed Using Normal Equation:\n", w)

# Predictions of the fitted weights on both splits
y_pred_train = X_train @ w
y_pred_test = X_test @ w

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Score the fitted weights on both splits: mean squared error and R².
train_mse, train_r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
test_mse, test_r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)

# Summary of both metrics per split
print(
    "\n Model Evaluation Results (Normal Equation):",
    f"Train R² Score: {train_r2:.4f}",
    f"Train MSE: {train_mse:.4f}",
    f"Test R² Score: {test_r2:.4f}",
    f"Test MSE: {test_mse:.4f}",
    sep="\n",
)

# Separate report that repeats only the errors
print(
    "\n Training and Testing Errors Report:",
    f"\n Mean Squared Error (Training): {train_mse:.4f}",
    f" Mean Squared Error (Testing): {test_mse:.4f}",
    sep="\n",
)

# Plot actual and predicted test-set prices against the observation index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test, label='Actual MSFT Price', marker='o')
ax.plot(y_pred_test, label='Predicted MSFT Price (Normal Equation)', linestyle='--', marker='x')
ax.set_title('Actual vs Predicted MSFT Stock Price (Normal Equation)')
ax.set_xlabel('Observation Index')
ax.set_ylabel('Price (USD)')
ax.legend()
ax.grid()
plt.show()

# Histogram (with KDE overlay) of the test-set residuals: actual minus predicted.
actual_flat = y_test.flatten()
predicted_flat = y_pred_test.flatten()
residuals = actual_flat - predicted_flat

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Residuals Distribution (Normal Equation)')
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
plt.show()

#Feature Coefficients Analysis
# The column of ones is stacked in front of the price columns when the design
# matrix is built, so the first entry of w is the bias and the remaining
# entries follow the order of feature_stocks. The bias label must therefore
# come first; listing it last would pair every coefficient with the wrong name.
coefficients = pd.DataFrame({
    'Feature': ['Dummy (Bias Term)'] + feature_stocks,
    'Coefficient': w.flatten()
}).sort_values(by='Coefficient', ascending=False)

print("\n Feature Coefficients (Normal Equation):\n", coefficients)