In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor

# Load Datasets

In [2]:
def load_data(file_path, n_features=9):
    """Load a numeric text file and split its columns into inputs and targets.

    Parameters
    ----------
    file_path : str or path-like
        File passed straight to ``np.loadtxt`` (default delimiter/dtype).
    n_features : int, default 9
        Number of leading columns returned as input features; every
        remaining column is returned as a target.

    Returns
    -------
    X : numpy.ndarray
        Columns ``[0, n_features)`` of the file, always 2-D.
    y : numpy.ndarray
        Columns ``[n_features, end)`` of the file, always 2-D.
    """
    # ndmin=2 keeps a one-row file two-dimensional. Without it np.loadtxt
    # squeezes the result to 1-D and the column slicing below raises IndexError.
    table = np.loadtxt(file_path, ndmin=2)
    X = table[:, :n_features]    # Input features
    y = table[:, n_features:]    # Output labels
    return X, y

In [3]:
# Read the multi-output dataset from a path relative to the working directory.
# load_data returns the first 9 columns as features (X_multi) and the
# remaining columns as targets (y_multi).
X_multi, y_multi = load_data('data/tictac_multi.txt')

# Linear Regression

In [5]:
def finetune_parameters(X_train, y_train):
    """Select LinearRegression hyperparameters by 10-fold cross-validated R^2.

    The search wraps ``LinearRegression`` in a ``MultiOutputRegressor`` and
    tries the combinations of ``fit_intercept`` and ``positive``.

    Parameters
    ----------
    X_train : array-like
        Training inputs passed to ``RandomizedSearchCV.fit``.
    y_train : array-like
        Training targets passed to ``RandomizedSearchCV.fit``.

    Returns
    -------
    dict
        Best parameters with the ``estimator__`` prefix stripped, so the dict
        can be unpacked directly into ``LinearRegression(**best_params)``.
        The dict is also printed.
    """
    # Define the parameter grid (2 x 2 = 4 combinations)
    param_grid = {'estimator__fit_intercept': [True, False],
                  'estimator__positive': [True, False]}

    # n_iter matches the number of combinations in the grid. A fixed
    # random_state makes the candidate order, and therefore the result,
    # repeatable across runs.
    randomized_search = RandomizedSearchCV(
        MultiOutputRegressor(LinearRegression()),
        param_grid,
        n_iter=4,
        cv=10,
        scoring='r2',
        random_state=42,
    )
    randomized_search.fit(X_train, y_train)

    # Strip the leading "estimator__" from every hyperparameter name. A new
    # dict is built so the search object's best_params_ is left untouched,
    # and only a true prefix is removed (not any substring match).
    prefix = "estimator__"
    best_params = {
        (name[len(prefix):] if name.startswith(prefix) else name): value
        for name, value in randomized_search.best_params_.items()
    }

    print(best_params)

    return best_params

In [6]:
def lr_train(X, y, is_one_tenth = False):
    """Tune, fit and evaluate a multi-output linear regression.

    Parameters
    ----------
    X : array-like
        Input features.
    y : array-like
        Regression targets.
    is_one_tenth : bool, default False
        When falsy, split with ``test_size=0.2``. When truthy, split with
        ``train_size=0.1`` so the model is trained on one tenth of the data.

    Returns
    -------
    val_accuracy : float
        Mean of the 10-fold cross-validation R^2 scores on the training split
        (an R^2 value, despite the name).
    test_accuracy : float
        ``model.score`` on the test split (the regressor's default score;
        R^2 per the scikit-learn docs).
    rmse : float
        Square root of ``mean_squared_error`` on the test split.
    """
    # Split into training and testing data. Truthiness is used instead of
    # `== False`, so None or 0 behaves like False rather than falling into
    # the one-tenth branch.
    split_size = {'train_size': 0.1} if is_one_tenth else {'test_size': 0.2}
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, random_state=42, **split_size)

    best_params = finetune_parameters(X_train, y_train)

    # Define and train model
    model = MultiOutputRegressor(LinearRegression(**best_params))
    model.fit(X_train, y_train)

    # Cross-validated R^2 on the training split
    val_accuracy = np.mean(
        cross_val_score(model, X_train, y_train, cv=10, scoring="r2"))

    # Score of the fitted model on the held-out split
    test_accuracy = model.score(X_test, y_test)

    # RMSE on the held-out split
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

    return val_accuracy, test_accuracy, rmse

In [7]:
def print_results(val_accuracy, test_accuracy, rmse, dataset_name):
    """Print a four-line summary of the evaluation metrics for one dataset.

    Parameters
    ----------
    val_accuracy : float
        Value shown on the "Cross Validation Accuracy" line.
    test_accuracy : float
        Value shown on the "Test Accuracy" line.
    rmse : float
        Value shown on the "RMSE" line.
    dataset_name : str
        Name interpolated into the header line.
    """
    print(f"Performance of Linear Regression on {dataset_name}:")

    # Each row is (label, value); print joins them with its default separator.
    report_rows = (
        ("Cross Validation Accuracy = ", val_accuracy),
        ("Test Accuracy = ", test_accuracy),
        ("RMSE = ", rmse),
    )
    for label, value in report_rows:
        print(label, value)

## Multi Dataset

In [8]:
# Tune, train and evaluate on the multi-output dataset using lr_train's
# default split (is_one_tenth=False), then print the metrics.
# NOTE(review): lr_train's first return value is the mean cross-validation
# R^2, so the name `val_rmse_multi` is a misnomer; it does not hold an RMSE.
val_rmse_multi, test_acc_multi, rmse_multi = lr_train(X_multi, y_multi)
print_results(val_rmse_multi, test_acc_multi, rmse_multi, "Multi Dataset")

{'positive': False, 'fit_intercept': True}
Performance of Linear Regression on Multi Dataset:
Cross Validation Accuracy =  0.00011944644299215242
Test Accuracy =  0.0025845568803451765
RMSE =  0.41040380121796166


In [9]:
# Repeat the evaluation with is_one_tenth=True (passed positionally), which
# makes lr_train split with train_size=0.1.
# NOTE(review): this rebinds the same three globals as the full-data run
# above, so those earlier results are overwritten. `val_rmse_multi` holds the
# mean cross-validation R^2, not an RMSE.
print("Extra Credit #2 - Train the models on 1/10th of the data")
val_rmse_multi, test_acc_multi, rmse_multi = lr_train(X_multi, y_multi, True)
print_results(val_rmse_multi, test_acc_multi, rmse_multi, "Multi Dataset")

Extra Credit #2 - Train the models on 1/10th of the data
{'positive': True, 'fit_intercept': True}
Performance of Linear Regression on Multi Dataset:
Cross Validation Accuracy =  -0.024859712363494317
Test Accuracy =  -0.011865730134446572
RMSE =  0.4133240657113709
