In [5]:
import os
import pandas as pd
import sys   
import numpy as np
from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

sys.path.append('..')
from Workflow import Workflow

In [6]:
# Instantiate the project-local Workflow helper imported in the first cell.
wf = Workflow()
# load_data() yields six objects, unpacked here in the order shown.
# NOTE(review): presumably feature/target pairs for the train, validation and
# test splits -- confirm against Workflow.load_data.
Xtrain, ytrain, Xvalid, yvalid, Xtest, ytest = wf.load_data()

In [7]:
def optimize_lasso_model(X, y, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, seed=42, verbose=3):
    """Tune a Lasso regressor with an exhaustive, cross-validated grid search.

    Parameters
    ----------
    X, y
        Training features and targets, passed unchanged to ``GridSearchCV.fit``.
    param_grid
        Search space forwarded to ``GridSearchCV`` as ``param_grid``.
    cv
        Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
    scoring
        Scoring name forwarded to ``GridSearchCV``. With the default
        ``'neg_mean_squared_error'`` the printed best score is a negated MSE,
        so values closer to zero are better.
    n_jobs
        Parallelism forwarded to ``GridSearchCV`` (default -1).
    seed
        ``random_state`` given to the base ``Lasso`` estimator. Per the
        scikit-learn documentation this only influences the fit when
        ``selection='random'``.
    verbose
        Verbosity forwarded to ``GridSearchCV``. Defaults to 3, the level that
        used to be hard-coded; pass 0 to silence the search log.

    Returns
    -------
    The ``best_estimator_`` of the fitted search (with ``GridSearchCV``'s
    default ``refit=True`` this is the best configuration refit on ``X, y``).
    """
    # Base model; the searched hyperparameters are applied by GridSearchCV.
    estimator = Lasso(random_state=seed)

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=verbose,
    )

    # Runs every parameter combination across the CV folds.
    grid_search.fit(X, y)

    # Report the winning configuration and its mean cross-validated score.
    print(f'Best parameters found: {grid_search.best_params_}')
    print(f'Best score: {grid_search.best_score_}')

    return grid_search.best_estimator_

In [9]:
# Search space handed to the grid search.
seed = 42
param_grid = dict(
    alpha=[0.01, 0.1, 1, 10, 100],  # regularization strength
    max_iter=[1000, 2000],          # maximum number of iterations
    tol=[1e-2, 1e-3, 1e-4],         # tolerance for the stopping criterion
)

# 5-fold grid search on the training split, scored by negated MSE.
best_model = optimize_lasso_model(
    Xtrain,
    ytrain,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    seed=seed,
)

# Predict on every split first ...
y_train_pred = best_model.predict(Xtrain)
y_valid_pred = best_model.predict(Xvalid)
y_test_pred = best_model.predict(Xtest)

# ... then score: R2 on all three splits, RMSE on validation only.
train_r2 = metrics.r2_score(ytrain, y_train_pred)
validation_mse = metrics.mean_squared_error(yvalid, y_valid_pred)
validation_rmse = np.sqrt(validation_mse)
validation_r2 = metrics.r2_score(yvalid, y_valid_pred)
test_r2 = metrics.r2_score(ytest, y_test_pred)

# Report in the same order as before.
for metric_label, metric_value in (
    ("Training R2 Score:", train_r2),
    ("Validation RMSE:", validation_rmse),
    ("Validation R2 Score:", validation_r2),
    ("Test R2 Score:", test_r2),
):
    print(metric_label, metric_value)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters found: {'alpha': 0.01, 'max_iter': 1000, 'tol': 0.0001}
Best score: -5265.826469936537
Training R2 Score: 0.4777517976823974
Validation RMSE: 73.32045557892536
Validation R2 Score: 0.43844198442808646
Test R2 Score: 0.4739764343756313
