## Hard (Feature engineering + selection) – DS1: Create 3 engineered features (e.g., TotalSF). L1-regularized logistic regression does not apply here because SalePrice is a continuous target, so use Lasso regression (sklearn) instead and explain which features survive and why.

In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the training data from train.csv in the working directory.
train_df = pd.read_csv('train.csv')

# Count NaNs per column and rank the columns from most to least sparse.
null_counts = train_df.isna().sum()
missing_values = null_counts.sort_values(ascending=False)
print("\nMissing values (top 10):")
print(missing_values.head(10))


Missing values (top 10):
PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageQual        81
GarageFinish      81
GarageType        81
dtype: int64


In [20]:
# Correlate every numeric column with SalePrice and rank from the
# strongest positive relationship downwards.
numeric_df = train_df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()
correlations = corr_matrix['SalePrice'].sort_values(ascending=False)
print("\nTop correlations with SalePrice:")
print(correlations.head(10))


Top correlations with SalePrice:
SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
Name: SalePrice, dtype: float64


In [21]:
# Predictors for the baseline model: the five columns most strongly
# correlated with the target in the ranking printed by the previous cell.
features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'TotalBsmtSF',
]

# Column the models are trained to predict.
target = 'SalePrice'

In [22]:
# Verify that none of the chosen predictors contain NaNs before fitting.
selected_nulls = train_df[features].isna().sum()
print("\nMissing values in selected features:")
print(selected_nulls)


Missing values in selected features:
OverallQual    0
GrLivArea      0
GarageCars     0
GarageArea     0
TotalBsmtSF    0
dtype: int64


In [23]:
# Design matrix and target vector for the baseline model.
X, y = train_df[features], train_df[target]

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: ordinary least squares on the untransformed SalePrice.
# The fit call stays as the last expression so the cell displays the estimator.
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [24]:
# Baseline for comparison: score the raw-target model fitted in the previous
# cell on the same validation split. Without this number the "Improved RMSE"
# printed below has nothing to be compared against.
baseline_preds = model.predict(X_val)
rmse_baseline = np.sqrt(mean_squared_error(y_val, baseline_preds))
print(f"Baseline RMSE: {rmse_baseline}")

# Log-transform the target so the regression is fitted on log(SalePrice).
y_train_log = np.log(y_train)

# Fit a second linear model on the log-target, using the same predictors.
model_log = LinearRegression()
model_log.fit(X_train, y_train_log)

# Predict in log space, then map back to the original units with np.exp()
# so the error is measured on the same scale as the baseline.
preds_log_scaled = model_log.predict(X_val)
preds_final = np.exp(preds_log_scaled)

# RMSE of the back-transformed predictions against the untransformed target.
rmse_log = np.sqrt(mean_squared_error(y_val, preds_final))
print(f"Improved RMSE: {rmse_log}")

Improved RMSE: 32297.2684989162


### Create 3 Engineered Features

In [25]:
# Engineered feature 1: above-ground living area plus basement area.
train_df['TotalSF'] = train_df['GrLivArea'].add(train_df['TotalBsmtSF'])

# Engineered feature 2: bathroom count, with each half bath weighted 0.5.
full_baths = train_df['FullBath'] + train_df['BsmtFullBath']
half_baths = train_df['HalfBath'] + train_df['BsmtHalfBath']
train_df['TotalBath'] = full_baths + 0.5 * half_baths

# Engineered feature 3: age of the house in the year it was sold.
train_df['HouseAge'] = train_df['YrSold'].sub(train_df['YearBuilt'])

# Candidate predictors for the Hard task: every numeric column
# (engineered ones included) except the target and the row identifier.
excluded_cols = ('SalePrice', 'Id')
numeric_cols = [
    col
    for col in train_df.select_dtypes(include=[np.number]).columns
    if col not in excluded_cols
]

# Lasso cannot be fitted on NaNs, so missing entries are replaced with 0.
# NOTE(review): 0 may be a poor stand-in for columns where zero is not a
# plausible value — confirm whether a different imputation is wanted.
X_hard = train_df[numeric_cols].fillna(0)
y_hard = train_df['SalePrice']

# Same 80/20 split and seed as the baseline model.
X_train_h, X_val_h, y_train_h, y_val_h = train_test_split(
    X_hard,
    y_hard,
    test_size=0.2,
    random_state=42,
)

In [26]:
# The L1 penalty acts on coefficient magnitude, so the predictors are
# standardised first. The scaler learns its statistics from the training
# split only and is then applied unchanged to the validation split.
scaler = StandardScaler().fit(X_train_h)
X_train_scaled = scaler.transform(X_train_h)
X_val_scaled = scaler.transform(X_val_h)

# Fit the Lasso on the standardised training data. The fit call stays as
# the last expression so the cell displays the estimator and its parameters.
lasso = Lasso(alpha=100, random_state=42)
lasso.fit(X_train_scaled, y_train_h)

0,1,2
,alpha,100
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [27]:
# Label each Lasso coefficient with its column name. numeric_cols is the
# column order of the matrix the model was fitted on, so positions line up.
coeffs = pd.Series(lasso.coef_, index=numeric_cols)

# Coefficients of the three engineered features.
print("\nHard Task - Engineered Feature Coefficients:")
print(f"TotalSF: {coeffs['TotalSF']:.2f}")
print(f"TotalBath: {coeffs['TotalBath']:.2f}")
print(f"HouseAge: {coeffs['HouseAge']:.2f}")

# The task asks which features survive, so report the full picture rather
# than only the engineered columns. A feature "survives" when its
# coefficient is non-zero after the L1 penalty has been applied.
survivor_mask = coeffs != 0
survived = coeffs[survivor_mask]
dropped = coeffs[~survivor_mask]

# Rank survivors by absolute coefficient. The inputs were standardised, so
# magnitudes are comparable across features.
magnitude_order = survived.abs().sort_values(ascending=False).index
survived = survived.reindex(magnitude_order)

print(f"\nSurviving features ({len(survived)} of {len(coeffs)}), ranked by |coefficient|:")
print(survived.round(2).to_string() if len(survived) else "(none)")
print(f"\nFeatures shrunk to zero ({len(dropped)}):")
print(", ".join(dropped.index) if len(dropped) else "(none)")

# Why an engineered feature can be zeroed: each one is an exact linear
# combination of columns that are also in numeric_cols (for example
# TotalSF = GrLivArea + TotalBsmtSF). Within such a collinear group the L1
# penalty can keep some members and set the rest to exactly zero; which
# ones it keeps depends on alpha and on the data.


Hard Task - Engineered Feature Coefficients:
TotalSF: 4407.09
TotalBath: 0.00
HouseAge: -3983.73
