# Linear Regression

In [33]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
%run -i ./Model_Eval.ipynb

In [2]:
# Load the training set and shuffle the rows. The shuffle is seeded so that the
# row order -- and therefore the positional train/test split and CV folds that
# follow -- is identical on every Restart & Run All.
training_data = pd.read_csv('../Datasets/training_data_with_aqi.csv').sample(frac=1, random_state=42)

In [13]:
# get_numerical_features comes from Model_Eval.ipynb (pulled in by the %run above).
# NOTE(review): confirm it does not return the target column, otherwise the
# target would leak into X.
numerical_features = get_numerical_features(training_data)
num_features = len(numerical_features)

# Feature matrix (numerical columns only) and regression target.
X = training_data[numerical_features]
Y = training_data['DIABETES_3Y_Change_Percentage']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable for a given row order of training_data.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Define SKLearn pipeline, with PCA

## Several different values for the number of PCA components will be tried

In [12]:
# Three-stage estimator: standardize the features, project them onto principal
# components, then fit a linear regression on the projected data.
# PCA is created with its default n_components; the value is overridden through
# the 'pca__n_components' entry of the grid-search parameter grid.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('regression', LinearRegression()),
    ]
)

In [29]:
# Candidate PCA sizes: every 4th value starting at 1 (the range stop is
# exclusive, so num_features itself is never produced by the range) ...
num_component_options = list(range(1, num_features, 4))
# ... plus the full feature count, so that "keep every component" is always part
# of the search. Derived from num_features rather than hard-coded so the grid
# stays valid if the set of numerical features changes.
num_component_options.append(num_features)
num_component_options

[1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 44]

In [30]:
# Search space for GridSearchCV. The '<step>__<param>' key addresses the
# n_components parameter of the pipeline step named 'pca'.
param_grid = dict(pca__n_components=num_component_options)

In [34]:
# Evaluate every candidate n_components with 5-fold cross-validation on the
# training split only; the held-out test split is not touched here.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, Y_train)

In [35]:
# Report the winning grid point and its mean cross-validated score.
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best Score:", grid_search.best_score_),
):
    print(label, value)

Best Parameters: {'pca__n_components': 44}
Best Score: 0.1266325658922451


# Finding: for linear regression, keeping all principal components still gives the best cross-validated score