In [1]:
# Question 3: Advanced Model Evaluation with Feature Selection for House Prices

# Step 1: Load a house prices dataset from CSV (Assume you have a house_prices.csv ).
# Step 2: Apply feature selection and create a train-test split.
# Step 3: Train a Lasso Regression model.
# Step 4: Perform model evaluation and hyperparameter tuning using GridSearchCV.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression

# Simulated house prices dataset
data = {
    'LotArea': [8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6120, 7420],
    'OverallQual': [7, 6, 7, 7, 8, 5, 8, 7, 7, 5],
    'OverallCond': [5, 8, 5, 5, 5, 5, 5, 6, 5, 7],
    'YearBuilt': [2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 1939],
    'GrLivArea': [1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 1077],
    'GarageCars': [2, 2, 2, 3, 3, 2, 3, 2, 2, 1],
    'SalePrice': [208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000, 129900, 118000]
}

df = pd.DataFrame(data)

# Step 2: Split features and target
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# Step 3 & 4: Create pipeline with feature selection and Lasso regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_regression)),
    ('model', Lasso(max_iter=10000))
])

# GridSearchCV for hyperparameter tuning
param_grid = {
    'feature_selection__k': [3, 4, 5],
    'model__alpha': [0.1, 1.0, 10.0]
}

grid = GridSearchCV(pipeline, param_grid, scoring='r2', cv=3)
grid.fit(X, y)

# Best model evaluation
y_pred = grid.predict(X)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

print("Best parameters:", grid.best_params_)
print("Mean Squared Error (on training):", round(mse, 2))
print("R² Score (on training):", round(r2, 2))


Best parameters: {'feature_selection__k': 3, 'model__alpha': 10.0}
Mean Squared Error (on training): 364863137.38
R² Score (on training): 0.89
