In [2]:
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [3]:
# Ticker symbols whose price history is downloaded and pooled into one dataset
tickers = [
    "BA", "NKE", "DIS", "ORCL", "CSCO",
    "PEP", "VZ", "PFE", "MRK", "KO",
]

In [4]:
# Bind `all_data` to an empty DataFrame so the name exists before the download
# cell runs; that cell assigns the pooled per-ticker price history to it.
all_data = pd.DataFrame()

In [5]:
# Fetch historical stock data for each ticker and pool it into `all_data`.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies every previously downloaded row on each
# pass, which is quadratic in the number of tickers.
frames = []
for ticker in tickers:
    data = yf.download(ticker, start="2015-01-01", end="2023-01-01")
    # Some yfinance releases return two-level (field, ticker) columns even for
    # a single symbol. Flatten to the field level so every ticker's frame has
    # the same column labels and the frames stack row-wise instead of landing
    # in separate, NaN-padded columns.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)
    # Tag the rows so the source symbol survives the pooling
    data['Ticker'] = ticker
    frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty frame in that case
all_data = pd.concat(frames) if frames else pd.DataFrame()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [6]:
# Move the date index into an ordinary 'Date' column and renumber the rows.
# Note: running this cell a second time without re-running the download would
# push the fresh integer index into an extra column as well.
all_data = all_data.reset_index()

In [7]:
# Feature engineering: break the trading date into its calendar components
trade_date = all_data['Date'].dt
all_data = all_data.assign(
    Day=trade_date.day,
    Month=trade_date.month,
    Year=trade_date.year,
)

In [8]:
# Keep only the rows in which every column holds a value
all_data = all_data.dropna(axis=0, how='any')

In [9]:
# Model inputs: price fields and volume from the row plus the calendar parts
# of its date.
# NOTE(review): 'High' and 'Low' come from the same row as the 'Close' target,
# so the model sees that day's trading range while predicting that day's
# close - confirm this is the intended setup.
feature_columns = ['Open', 'High', 'Low', 'Volume', 'Day', 'Month', 'Year']
X = all_data[feature_columns]

# Regression target: the closing price of the same row
y = all_data['Close']

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): rows from all dates and tickers are split at random, so test
# rows can be dated earlier than training rows - confirm a chronological
# hold-out is not required for this analysis.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Standardize the features: the scaling statistics are learned from the
# training rows only and then applied unchanged to both partitions.
# This cell rebinds X_train / X_test to their scaled versions, so re-run the
# split cell first if this cell ever needs to be executed again; otherwise the
# scaler would be refitted on data that is already standardized.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Baseline regressors with default hyperparameters, keyed by display name.
# The two tree ensembles are seeded (random_state=42) for repeatable fits.
models = {}
models['SVR'] = SVR()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(random_state=42)
models['KNN Regressor'] = KNeighborsRegressor()

In [13]:
# Hyperparameter search spaces, one per model family.
# Each grid is the full cross product of its value lists, so the number of
# candidates is the product of the list lengths (32, 81, 243 and 32).

# Support vector regression: kernel type, regularization strength, kernel width
param_grid_svr = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto'],
)

# Random forest: ensemble size and tree-growth limits
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Gradient boosting: ensemble size, shrinkage and tree-growth limits
param_grid_gbr = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# k-nearest neighbours: neighbourhood size, vote weighting, search structure
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 10],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

In [13]:
# Exhaustive 5-fold cross-validated search over the SVR grid, ranked by R2.
# verbose=2 logs one line per individual fit.
grid_search_svr = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid_svr,
    scoring='r2',
    cv=5,
    verbose=2,
)
grid_search_svr.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  14.4s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  14.8s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  14.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  15.3s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  14.3s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  17.3s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  15.9s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  16.4s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  16.9s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  17.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  29.2s
[CV] END .....................C=0.1, gamma=scal

In [14]:
# Exhaustive 5-fold cross-validated search over the random-forest grid, ranked
# by R2. The base estimator is seeded so every candidate fit is repeatable.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    scoring='r2',
    cv=5,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   7.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   7.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   7.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  11.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  17.6s
[CV] END max_depth=5, min_samples_leaf=1, m

In [15]:
# Exhaustive 5-fold cross-validated search over the gradient-boosting grid,
# ranked by R2. The base estimator is seeded so every candidate fit is repeatable.
grid_search_gbr = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid_gbr,
    scoring='r2',
    cv=5,
    verbose=2,
)
grid_search_gbr.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  13.8s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  13.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; 

In [16]:
# Exhaustive 5-fold cross-validated search over the KNN grid, ranked by R2.
# verbose=2 logs one line per individual fit.
grid_search_knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid_knn,
    scoring='r2',
    cv=5,
    verbose=2,
)
grid_search_knn.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=5, we

In [18]:
# Evaluate the refitted best estimator of each grid search on the test set.
# `models` is rebound here to the tuned estimators; the tuned SVR is left out
# of this comparison.
tuned_searches = (
    ('Tuned Random Forest', grid_search_rf),
    ('Tuned Gradient Boosting', grid_search_gbr),
    ('Tuned KNN Regressor', grid_search_knn),
)
models = {label: search.best_estimator_ for label, search in tuned_searches}

# Report mean squared error and R2 for every tuned model
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print(f"{name} - Final MSE: {mse:.4f}, Final R2 Score: {r2:.4f}")

Tuned Random Forest - Final MSE: 0.7605, Final R2 Score: 0.9998
Tuned Gradient Boosting - Final MSE: 0.6676, Final R2 Score: 0.9998
Tuned KNN Regressor - Final MSE: 11.6004, Final R2 Score: 0.9969


In [19]:
# Select the model with the highest test-set R2 and report that score as a
# percentage. The printed "accuracy" is R2 * 100, not a classification accuracy.
# Every model is scored exactly once and the scores are reused, rather than
# re-running predict() inside the max() key and again for the winner.
# NOTE(review): the winner is chosen on the same test rows it is then reported
# on, so the reported figure is likely optimistic - confirm whether a separate
# validation score should drive the selection.
test_r2_by_model = {
    name: r2_score(y_test, model.predict(X_test))
    for name, model in models.items()
}

# `best_model` is the dictionary key (display name), not the estimator itself
best_model = max(test_r2_by_model, key=test_r2_by_model.get)
best_accuracy = test_r2_by_model[best_model] * 100
print(f"Final accuracy of the best model ({best_model}): {best_accuracy:.2f}%")

Final accuracy of the best model (Tuned Gradient Boosting): 99.98%


In [20]:
# Example input: one hypothetical trading day described with the same seven
# fields, in the same order, as the training features.
new_observation = {
    'Open': 150,
    'High': 155,
    'Low': 148,
    'Volume': 10000000,
    'Day': 12,
    'Month': 8,
    'Year': 2024,
}
# A one-element list of records yields a single-row frame
new_data = pd.DataFrame([new_observation])

In [21]:
# Standardize the new data with the scaler that was fitted on the training
# features, so the sample is on the same scale as the data the models were
# trained on. The scaler is only applied here, not refitted.
new_data_scaled = scaler.transform(new_data)

In [22]:
# Look up the winning estimator by its display name, predict on the
# standardized sample and print the single resulting value.
# NOTE(review): the features carry no ticker, so the prediction is not tied to
# any particular stock - confirm that is acceptable for this example.
selected_model = models[best_model]
predicted_price = selected_model.predict(new_data_scaled)
print(f"Predicted price for new data using {best_model}: ${predicted_price[0]:.2f}")

Predicted price for new data using Tuned Gradient Boosting: $152.20
