In [106]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [107]:
df = pd.read_csv('preprocessed_data.csv')

In [108]:
df.head()

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,Mkt-RF,SMB,...,DEXUSUK,DCOILWTICO,VIXCLS,DEXJPUS,DEXUSEU,T5YIE,OBMMIJUMBO30YF,NIKKEI225,BAMLH0A0HYM2EY,DCOILBRENTEU
0,2014-11-11,27.857022,28.05553,27.743303,27.99518,21966000.0,0.0,0.0,0.1,0.0,...,,77.85,12.92,,,,,17124.11,6.05,
1,2014-11-12,27.988198,27.993185,27.706394,27.843555,21882000.0,0.0,0.0,0.07,0.71,...,1.5815,77.16,13.02,115.42,1.2456,1.53,,17197.05,6.05,
2,2014-11-13,27.971239,27.971738,27.636069,27.753279,30220000.0,0.0,0.0,-0.03,-0.9,...,1.5717,74.13,13.79,115.63,1.2476,1.48,,17392.79,6.07,
3,2014-11-14,27.831085,27.831085,27.557263,27.690933,28502000.0,0.0,0.0,0.06,-0.15,...,1.5657,75.91,13.31,116.52,1.2494,1.45,,17490.83,6.11,
4,2014-11-15,,,,,,,,,,...,,,,,,,,,,


In [109]:
# Data cleaning: Replace specified characters with NaN and drop remaining NaN rows

df1=df.copy()
df1.dropna(inplace=True)


In [110]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



# Define the data preparation function
def prepare_data(data, features, target, test_size=0.2):
    X = df1.drop(['date', 'close'], axis=1)
    y = df1['close']
    
    split_index = int(len(X) * (1 - test_size))
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
    
    return X_train, X_test, y_train, y_test

# Define model training and evaluation function with hyperparameter tuning and imputation
def train_evaluate_model(model, X_train, y_train, tscv, param_grid=None):
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Handle NaNs in the dataset
        ('model', model)
    ])
    
    if param_grid:
        search = GridSearchCV(pipeline, param_grid, cv=tscv, scoring='neg_mean_squared_error')
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_
        print(f"Best Parameters: {best_params}")
    else:
        pipeline.fit(X_train, y_train)
        best_model = pipeline
    
    # Cross-validation for evaluation metrics
    mse_scores, mae_scores = [], []
    for train_idx, val_idx in tscv.split(X_train):
        X_cv_train, X_cv_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        best_model.fit(X_cv_train, y_cv_train)
        y_pred = best_model.predict(X_cv_val)
        
        mse_scores.append(mean_squared_error(y_cv_val, y_pred))
        mae_scores.append(mean_absolute_error(y_cv_val, y_pred))
    
    results = {
        'mean_mse': np.mean(mse_scores),
        'mean_mae': np.mean(mae_scores)
    }
    print("Cross-Validation MSE:", results['mean_mse'])
    print("Cross-Validation MAE:", results['mean_mae'])
    
    return best_model, results

# Main function for the full pipeline
def time_series_regression_pipeline(data, test_size=0.2):
    features = [col for col in data.columns if col not in ['date', 'close']]
    target = 'close'
    
    X_train, X_test, y_train, y_test = prepare_data(data, features, target, test_size)
    
    tscv = TimeSeriesSplit(n_splits=5)
    
    models = {
        'Ridge': (Ridge(), {'model__alpha': [0.1, 1.0, 10.0, 100.0]}),
        'Lasso': (Lasso(), {'model__alpha': [0.1, 1.0, 10.0, 100.0]}),
        'ElasticNet': (ElasticNet(), {'model__alpha': [0.1, 1.0, 10.0, 100.0], 'model__l1_ratio': [0.1, 0.5, 0.9]})
    }
    
    final_models = {}
    for model_name, (model, param_grid) in models.items():
        print(f"Training {model_name} model...")
        best_model, results = train_evaluate_model(model, X_train, y_train, tscv, param_grid)
        final_models[model_name] = best_model
    
    # Evaluate final models on test set
    for model_name, model in final_models.items():
        y_pred_test = model.predict(X_test)
        test_mse = mean_squared_error(y_test, y_pred_test)
        test_mae = mean_absolute_error(y_test, y_pred_test)
        print(f"{model_name} Test MSE: {test_mse}, Test MAE: {test_mae}")
    
    return final_models

# Execute the pipeline
final_models = time_series_regression_pipeline(df1)


Training Ridge model...
Best Parameters: {'model__alpha': 10.0}
Cross-Validation MSE: 1.2931941273001872
Cross-Validation MAE: 0.887297833354625
Training Lasso model...


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  m

Best Parameters: {'model__alpha': 0.1}
Cross-Validation MSE: 1.7805227969790476
Cross-Validation MAE: 0.9922527653708493
Training ElasticNet model...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best Parameters: {'model__alpha': 0.1, 'model__l1_ratio': 0.5}
Cross-Validation MSE: 1.316887832677844
Cross-Validation MAE: 0.8688295205259084
Ridge Test MSE: 4.465432321165408, Test MAE: 1.8512407767502423
Lasso Test MSE: 1.174618761879587, Test MAE: 0.8625514902176807
ElasticNet Test MSE: 1.2072319719835876, Test MAE: 0.8818861914655421


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
