In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

def train_linear_regression(
    data: pd.DataFrame, 
    target: str, 
    alphas: list[int] = [0.1, 1, 10, 100],
    regularization: str = None,
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train a linear regression model with optional regularization.
    """
    # Separate features and target
    X = data.drop(target, axis=1)
    y = data[target]

    categorical_columns = X.select_dtypes(include=['object', 'category']).columns
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

    # Create preprocessor with one-hot encoder for categories.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ('num', StandardScaler(), numerical_columns)
            # ('num', 'passthrough', numerical_columns)
        ])

    # Select appropriate regressor
    if regularization == 'ridge':
        regressor = Ridge()
        alphas = alphas#[1, 10.0, 100.0]#[5.0, 6, 7, 8, 9, 10.0, 12, 20.0]
        param_grid = {'regressor__alpha': alphas}
    elif regularization == 'lasso':
        regressor = Lasso(max_iter=50000, tol=0.001, selection='random')
        alphas = alphas#[0.1, 1, 10.0, 100.0]
        param_grid = {'regressor__alpha': alphas}
    else:
        regressor = LinearRegression()
        param_grid = {}

    # Create a pipeline with preprocessor and regression
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Fit the model
    if regularization in ['ridge', 'lasso']:
        from sklearn.model_selection import GridSearchCV
        grid_search = GridSearchCV(
            model, 
            param_grid, 
            cv=5, 
            scoring='neg_mean_squared_error'
        )
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        y_pred = best_model.predict(X_test)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        best_model = model
        best_params = {}

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Get feature names
    feature_names = (
        list(best_model.named_steps['preprocessor']
             .named_transformers_['cat']
             .get_feature_names_out(categorical_columns)) + 
        list(numerical_columns)
    )
    
    # Create a dataframe of coefficients
    coefficients = pd.DataFrame({
        'feature': feature_names,
        'importance': np.abs(best_model.named_steps['regressor'].coef_)
    }).sort_values('importance', ascending=False)

    # Return results
    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': coefficients,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }

# df = pd.read_csv('data/train_transformed.csv')
# df = pd.read_csv('data/train_cleaned.csv')

# results_no_reg = train_linear_regression(df, target='SalePrice')
# results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge')
# results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso')

# print('Results with no regularization:')
# for metric, value in results_no_reg['performance'].items():
#     print(f"{metric}: {value}")

# print('Results with ridge regularization:')    
# for metric, value in results_ridge['performance'].items():
#     print(f"{metric}: {value}")
# print(results_ridge['best_params'])

    
# print('Results with lasso regularization:')
# for metric, value in results_lasso['performance'].items():
#     print(f"{metric}: {value}")
    


In [2]:
# # print(results_lasso['feature_importances'])
# print("Number of features with zero coefficient in lasso regularization:",(results_lasso['feature_importances']['importance']==0.0).sum(), 'out of', (results_lasso['feature_importances']['importance']).count())
# print("Number of features with zero coefficient in lasso regularization:",(results_lasso['feature_importances']['importance']!=0).sum())
# print("Number of features with zero coefficient in ridge regularization:",(results_ridge['feature_importances']['importance']==0.0).sum())


In [None]:
df = pd.read_csv('data/train_cleaned.csv')
# print(df.columns)

results_no_reg = train_linear_regression(df, target='SalePrice')

print('Results with no regularization:')
for metric, value in results_no_reg['performance'].items():
    print(f"{metric}: {value}")


results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge')

print('Results with ridge regularization:')    
for metric, value in results_ridge['performance'].items():
    print(f"{metric}: {value}")
print(results_ridge['best_params'])

results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso')
    
print('Results with lasso regularization:')
for metric, value in results_lasso['performance'].items():
    print(f"{metric}: {value}")
print(results_lasso['best_params'])


  model = cd_fast.sparse_enet_coordinate_descent(


In [None]:
df = pd.read_csv('data/train_log1p.csv')
# print(df.columns)

print('Results with no regularization:')
for metric, value in results_no_reg['performance'].items():
    print(f"{metric}: {value}")


results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge')

print('Results with ridge regularization:')    
for metric, value in results_ridge['performance'].items():
    print(f"{metric}: {value}")
print(results_ridge['best_params'])

results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso')
    
print('Results with lasso regularization:')
for metric, value in results_lasso['performance'].items():
    print(f"{metric}: {value}")
print(results_lasso['best_params'])



Results with no regularization:
root_mean_squared_error: 0.13362253964398918
r2_score: 0.9043197657548931
Results with ridge regularization:
root_mean_squared_error: 0.13759280939027674
r2_score: 0.8985494833983114
{'regressor__alpha': 10.0}
Results with lasso regularization:
root_mean_squared_error: 0.25447669646872534
r2_score: 0.6529765501202527
{'regressor__alpha': 0.1}


In [None]:
df = pd.read_csv('data/train_full_EDA.csv')
print(df.columns)

print('Results with no regularization:')
for metric, value in results_no_reg['performance'].items():
    print(f"{metric}: {value}")


results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge')

print('Results with ridge regularization:')    
for metric, value in results_ridge['performance'].items():
    print(f"{metric}: {value}")
print(results_ridge['best_params'])

results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso')
    
print('Results with lasso regularization:')
for metric, value in results_lasso['performance'].items():
    print(f"{metric}: {value}")
print(results_lasso['best_params'])



Index(['MSSubClass', 'MSZoning', 'Alley', 'LotShape', 'LandContour',
       'LotConfig', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
       'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'SaleType',
       'SaleCondition', 'SalePrice', 'TotalBsmtSF_1stFlrSF_PC',
  

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Results with no regularization:
root_mean_squared_error: 31210.930823417777
r2_score: 0.8730012027452739
Results with ridge regularization:
root_mean_squared_error: 32808.33352518662
r2_score: 0.8596687129562618
{'regressor__alpha': 100.0}
Results with lasso regularization:
root_mean_squared_error: 29469.005472553603
r2_score: 0.8867815709418356
{'regressor__alpha': 100.0}
