# Linear regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer

def train_linear_regression(
    data: pd.DataFrame, 
    target: str, 
    alphas: list[int] = [0.1, 1, 10, 100],
    num_transformer: str = 'robust',
    regularization: str = None,
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train a linear regression model with optional regularization.
    Calculates R² correctly for log-transformed targets.
    """
    # Separate features and target
    X = data.drop(target, axis=1)
    
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
    
    log_transformer = FunctionTransformer(np.log1p, validate=True)
    
    # Determine scaling and target transformation
    use_log_transform = num_transformer in ['log', 'log+robust']
    y = data[target]
    
    if num_transformer == 'robust': 
        num_pipeline = RobustScaler()
    elif num_transformer == 'standard': 
        num_pipeline = StandardScaler()
    elif num_transformer == 'log+robust':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', RobustScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log+standard':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', StandardScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log':
        num_pipeline = log_transformer
        y = np.log1p(y)
    else:
        num_pipeline = 'passthrough'

    # Create preprocessor with one-hot encoder for categories
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ('num', num_pipeline, numerical_columns)
        ])

    # Select appropriate regressor
    if regularization == 'ridge':
        regressor = Ridge()
        param_grid = {'regressor__alpha': alphas}
    elif regularization == 'lasso':
        regressor = Lasso(max_iter=50000, tol=0.001, selection='random') # for better stability reduced tolerance and large max_iter
        param_grid = {'regressor__alpha': alphas}
    else:
        regressor = LinearRegression()
        param_grid = {}

    # Create a pipeline with preprocessor and regression
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Fit the model
    if regularization in ['ridge', 'lasso']:
        from sklearn.model_selection import GridSearchCV
        grid_search = GridSearchCV(
            model, 
            param_grid, 
            cv=5, 
            scoring='neg_root_mean_squared_error'
        )
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        y_pred = best_model.predict(X_test)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        best_model = model
        best_params = {}

    # Correct R² calculation for log-transformed targets
    if use_log_transform:
        # Inverse transform predictions and actual values
        y_pred_original = np.expm1(y_pred)
        y_test_original = np.expm1(y_test)
        
        # Calculate MSE and R² on original scale
        mse = mean_squared_error(y_test_original, y_pred_original)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test_original, y_pred_original)
    else:
        # Use original scale calculations if no log transform
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

    # Get feature names
    feature_names = (
        list(best_model.named_steps['preprocessor']
             .named_transformers_['cat']
             .get_feature_names_out(categorical_columns)) + 
        list(numerical_columns)
    )
    
    # Create a dataframe of coefficients
    coefficients = pd.DataFrame({
        'feature': feature_names,
        'importance': np.abs(best_model.named_steps['regressor'].coef_)
    }).sort_values('importance', ascending=False)

    # Return results
    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': coefficients,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }

In [3]:
# # print(results_lasso['feature_importances'])
# print("Number of features with zero coefficient in lasso regularization:",(results_lasso['feature_importances']['importance']==0.0).sum(), 'out of', (results_lasso['feature_importances']['importance']).count())
# print("Number of features with zero coefficient in lasso regularization:",(results_lasso['feature_importances']['importance']!=0).sum())
# print("Number of features with zero coefficient in ridge regularization:",(results_ridge['feature_importances']['importance']==0.0).sum())


For only cleaned data we necessarily need to apply log, so either log or log+robust

In [4]:
df = pd.read_csv('data/train_cleaned.csv')
# print(df.columns)

trans= 'log'


results_no_reg = train_linear_regression(df, target='SalePrice', num_transformer=trans)

print('Results with no regularization:')
for metric, value in results_no_reg['performance'].items():
    print(f"{metric}: {value}")


results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge', alphas =[1, 10, 100, 200], num_transformer=trans)

print('Results with ridge regularization:')    
for metric, value in results_ridge['performance'].items():
    print(f"{metric}: {value}")
print(results_ridge['best_params'])

results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso', alphas =[0.00001, 0.0001, 0.001, 0.01], num_transformer=trans)
    
print('Results with lasso regularization:')
for metric, value in results_lasso['performance'].items():
    print(f"{metric}: {value}")
print(results_lasso['best_params'])

Results with no regularization:
root_mean_squared_error: 24585.016534232615
r2_score: 0.9211998262313611
Results with ridge regularization:
root_mean_squared_error: 26611.495349261215
r2_score: 0.9076738504210954
{'regressor__alpha': 10}
Results with lasso regularization:
root_mean_squared_error: 27724.219094653377
r2_score: 0.8997914425480196
{'regressor__alpha': 0.001}


In [5]:
df = pd.read_csv('data/train_cleaned.csv')
# print(df.columns)

trans= 'log+robust'


results_no_reg = train_linear_regression(df, target='SalePrice', num_transformer=trans)

print('Results with no regularization:')
for metric, value in results_no_reg['performance'].items():
    print(f"{metric}: {value}")


results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge', alphas =[1, 10, 100, 200], num_transformer=trans)

print('Results with ridge regularization:')    
for metric, value in results_ridge['performance'].items():
    print(f"{metric}: {value}")
print(results_ridge['best_params'])

results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso', alphas =[0.0001, 0.001, 0.01], num_transformer=trans)
    
print('Results with lasso regularization:')
for metric, value in results_lasso['performance'].items():
    print(f"{metric}: {value}")
print(results_lasso['best_params'])

Results with no regularization:
root_mean_squared_error: 24920.74970868182
r2_score: 0.9190329395038187
Results with ridge regularization:
root_mean_squared_error: 25945.05395315101
r2_score: 0.9122402611767442
{'regressor__alpha': 10}
Results with lasso regularization:
root_mean_squared_error: 27284.275000960046
r2_score: 0.9029465446162541
{'regressor__alpha': 0.001}


In [6]:
df = pd.read_csv('data/train_full_EDA.csv')
# print(df.columns)

trans= 'log+robust'


results_no_reg = train_linear_regression(df, target='SalePrice', num_transformer=trans)

print('Results with no regularization:')
for metric, value in results_no_reg['performance'].items():
    print(f"{metric}: {value}")


results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge', alphas =[1, 10, 100, 200], num_transformer=trans)

print('Results with ridge regularization:')    
for metric, value in results_ridge['performance'].items():
    print(f"{metric}: {value}")
print(results_ridge['best_params'])

results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso', alphas =[0.0001, 0.001, 0.01], num_transformer=trans)
    
print('Results with lasso regularization:')
for metric, value in results_lasso['performance'].items():
    print(f"{metric}: {value}")
print(results_lasso['best_params'])



Results with no regularization:
root_mean_squared_error: 24727.639766341188
r2_score: 0.9202828990713587
Results with ridge regularization:
root_mean_squared_error: 25916.506952209693
r2_score: 0.9124332767002035
{'regressor__alpha': 10}
Results with lasso regularization:
root_mean_squared_error: 27553.768904810608
r2_score: 0.9010198314449358
{'regressor__alpha': 0.001}


In [7]:
df = pd.read_csv('data/train_full_EDA.csv')
# print(df.columns)

trans= 'log+standard'


results_no_reg = train_linear_regression(df, target='SalePrice', num_transformer=trans)

print('Results with no regularization:')
for metric, value in results_no_reg['performance'].items():
    print(f"{metric}: {value}")


results_ridge = train_linear_regression(df, target='SalePrice', regularization='ridge', alphas =[1, 10, 100, 200], num_transformer=trans)

print('Results with ridge regularization:')    
for metric, value in results_ridge['performance'].items():
    print(f"{metric}: {value}")
print(results_ridge['best_params'])

results_lasso = train_linear_regression(df, target='SalePrice', regularization='lasso', alphas =[0.0001, 0.001, 0.01], num_transformer=trans)
    
print('Results with lasso regularization:')
for metric, value in results_lasso['performance'].items():
    print(f"{metric}: {value}")
print(results_lasso['best_params'])



Results with no regularization:
root_mean_squared_error: 0.13191076161573426
r2_score: 0.9067554962244819
Results with ridge regularization:
root_mean_squared_error: 0.1366505713849984
r2_score: 0.8999341957242911
{'regressor__alpha': 10}
Results with lasso regularization:
root_mean_squared_error: 0.1412809490365238
r2_score: 0.893037882111975
{'regressor__alpha': 0.001}


### Linear regression conclusions:

From above we can see, that the best R2 is obtained for cleaned-only data using no regularization. Of course, given that we have almost 400 predictors (most from one-hot-encoding) and 1560 rows, there is no problem with overfitting to the data. Linear models tend to have relativily large bias due to strong assumptions, and regularization only enhances the bias without lowering variance. 

Somehow surprisingly the performance of the model with very good already, which indicates that linearity assumption approximately holds.

# Decision tree