In [5]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

def train_linear_regression(
    data: pd.DataFrame, 
    target: str, 
    regularization: str = None,
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train a linear regression model with optional regularization.

    Categorical columns (``object`` / ``category`` dtype) are one-hot encoded;
    numeric and boolean columns are passed through unchanged. Columns of any
    other dtype are dropped by the preprocessor.

    Parameters
    ----------
    data : pd.DataFrame
        Full dataset containing both the feature columns and the target column.
    target : str
        Name of the column in ``data`` to predict.
    regularization : str, optional
        ``'ridge'`` or ``'lasso'`` to fit the corresponding regularized model,
        with ``alpha`` tuned by 5-fold cross-validated grid search on the
        training split. Any other value (including ``None``) fits an ordinary
        ``LinearRegression``.
    test_size : float, default 0.2
        Fraction of the rows held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split and for Lasso's random coordinate
        selection, so repeated calls give the same result.

    Returns
    -------
    dict
        ``'model'``: the fitted pipeline (best estimator when a grid search ran).
        ``'performance'``: dict with ``'root_mean_squared_error'`` and
        ``'r2_score'`` computed on the held-out test split.
        ``'best_params'``: grid-search winner, or ``{}`` without regularization.
        ``'feature_importances'``: DataFrame with columns ``'feature'`` and
        ``'importance'`` (absolute coefficient), sorted descending.
        ``'train_data'`` / ``'test_data'``: ``(X, y)`` tuples of the split.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.

    Notes
    -----
    Numeric features are not scaled, so ``'importance'`` values (and the
    effect of the ridge/lasso penalty) depend on each feature's units.
    """
    # Separate features and target
    X = data.drop(columns=target)
    y = data[target]

    categorical_columns = X.select_dtypes(include=['object', 'category']).columns
    # Select every numeric width plus booleans; matching only int64/float64
    # would let the ColumnTransformer silently drop e.g. int32 or bool columns.
    numerical_columns = X.select_dtypes(include=['number', 'bool']).columns

    # Create preprocessor with one-hot encoder for categories.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ('num', 'passthrough', numerical_columns)
        ])

    # Select appropriate regressor and the alpha grid to search (empty = no search)
    if regularization == 'ridge':
        regressor = Ridge()
        param_grid = {'regressor__alpha': [5.0, 6, 7, 8, 9, 10.0, 12, 20.0]}
    elif regularization == 'lasso':
        # selection='random' draws coordinates from an RNG; seed it so the
        # fitted coefficients are reproducible across runs.
        regressor = Lasso(
            max_iter=10000,
            tol=0.001,
            selection='random',
            random_state=random_state,
        )
        param_grid = {'regressor__alpha': [10.0, 100.0, 1000.0]}
    else:
        regressor = LinearRegression()
        param_grid = {}

    # Create a pipeline with preprocessor and regression
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the model; hyper-parameters are tuned on the training split only
    if param_grid:
        grid_search = GridSearchCV(
            model, 
            param_grid, 
            cv=5, 
            scoring='neg_mean_squared_error'
        )
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        model.fit(X_train, y_train)
        best_model = model
        best_params = {}

    # Evaluate the model on the held-out split
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Get feature names in the same order the ColumnTransformer emits them:
    # one-hot columns first, then the passthrough columns.
    if len(categorical_columns) > 0:
        encoder = best_model.named_steps['preprocessor'].named_transformers_['cat']
        encoded_names = list(encoder.get_feature_names_out(categorical_columns))
    else:
        # With no categorical columns the encoder is never fitted, so asking
        # it for feature names would raise NotFittedError.
        encoded_names = []
    feature_names = encoded_names + list(numerical_columns)

    # Create a dataframe of coefficients
    coefficients = pd.DataFrame({
        'feature': feature_names,
        'importance': np.abs(best_model.named_steps['regressor'].coef_)
    }).sort_values('importance', ascending=False)

    # Return results
    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': coefficients,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }

# Load the transformed training set. Point this at 'data/train_cleaned.csv'
# instead to run the same comparison on the alternative file.
df = pd.read_csv('data/train_transformed.csv')

# Fit all three variants up front so the report below only prints.
results_no_reg = train_linear_regression(df, target='SalePrice')
results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge')
results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso')

# (heading, results dict, whether to also print the tuned hyper-parameters)
report_plan = [
    ('Results with no regularization:', results_no_reg, False),
    ('Results with ridge regularization:', results_ridge, True),
    ('Results with lasso regularization:', results_lasso, False),
]

for heading, outcome, show_params in report_plan:
    print(heading)
    for metric, value in outcome['performance'].items():
        print(f"{metric}: {value}")
    if show_params:
        print(outcome['best_params'])
    


Results with no regularization:
root_mean_squared_error: 30756.73736256783
r2_score: 0.8766705785858766
Results with ridge regularization:
root_mean_squared_error: 32236.909139519586
r2_score: 0.8645144570388819
{'regressor__alpha': 6}
Results with lasso regularization:
root_mean_squared_error: 29397.44187129908
r2_score: 0.8873307907339534


In [2]:
# Count coefficients that are exactly zero in each regularized model.
# 'importance' holds absolute coefficient values, so == 0.0 marks features
# the model effectively removed.
lasso_zero_count = (results_lasso['feature_importances']['importance'] == 0.0).sum()
ridge_zero_count = (results_ridge['feature_importances']['importance'] == 0.0).sum()

print("Number of features with zero coefficient in lasso regularization:", lasso_zero_count)
# Label fixed: this line reports the ridge model, not lasso.
print("Number of features with zero coefficient in ridge regularization:", ridge_zero_count)


Number of features with zero coefficient in lasso regularization: 259
Number of features with zero coefficient in lasso regularization: 0
