In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the original dataset and preprocessed data
data = pd.read_csv('../data/BostonHousing.csv')
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv').values.ravel()
y_test = pd.read_csv('../data/y_test.csv').values.ravel()

print("Original and preprocessed data loaded successfully!")
print("Original data shape:", data.shape)

Original and preprocessed data loaded successfully!
Original data shape: (506, 14)


In [4]:
# Create new features
# We'll add features to the original dataset before splitting/scaling
# 1. Interaction term: RM * LSTAT (rooms and socio-economic status might interact)
data['RM_LSTAT'] = data['RM'] * data['LSTAT']

# 2. Polynomial feature: LSTAT^2 (non-linear relationship with MEDV)
data['LSTAT_sq'] = data['LSTAT'] ** 2

# 3. Log transformation: CRIM (crime rate is skewed, log might help)
data['log_CRIM'] = np.log1p(data['CRIM'])  # log1p handles zeros

# 4. Ratio: TAX/RM (tax burden per room)
data['TAX_per_RM'] = data['TAX'] / data['RM']

print("\nNew features added:")
print(data[['RM_LSTAT', 'LSTAT_sq', 'log_CRIM', 'TAX_per_RM']].head())


New features added:
   RM_LSTAT  LSTAT_sq  log_CRIM  TAX_per_RM
0  32.74350   24.8004  0.006300   45.019011
1  58.68794   83.5396  0.026944   37.688834
2  28.95555   16.2409  0.026924   33.681280
3  20.57412    8.6436  0.031857   31.723350
4  38.09351   28.4089  0.066770   31.061984


In [5]:
# Step 2: Test different feature combinations
# Prepare features
X = data.drop('MEDV', axis=1)
y = data['MEDV']

# Standardize all features (including new ones)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Split data again to ensure consistency
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [6]:
# Define feature sets to test
feature_sets = {
    'Original': X_train.columns,  # Original 13 features
    'Original + RM_LSTAT': list(X_train.columns) + ['RM_LSTAT'],
    'Original + LSTAT_sq': list(X_train.columns) + ['LSTAT_sq'],
    'Original + All New': X_train_new.columns  # All features including new ones
}

In [7]:
# Step 3: Evaluate the impact of new features on model performance
results = {}

for name, features in feature_sets.items():
    # Select features
    X_train_subset = X_train_new[features]
    X_test_subset = X_test_new[features]

    # Train Linear Regression model
    model = LinearRegression()
    model.fit(X_train_subset, y_train_new)

    # Predict and evaluate
    y_test_pred = model.predict(X_test_subset)
    mse = mean_squared_error(y_test_new, y_test_pred)
    r2 = r2_score(y_test_new, y_test_pred)

In [12]:
results[name] = {'MSE': mse, 'R²': r2}
print(f"\nPerformance with {name}:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

# Compare results
print("\nSummary of Feature Set Performance:")
for name, metrics in results.items():
    print(f"{name}: MSE = {metrics['MSE']:.2f}, R² = {metrics['R²']:.2f}")


Performance with Original + All New:
Mean Squared Error (MSE): 14.96
R² Score: 0.80

Summary of Feature Set Performance:
Original + All New: MSE = 14.96, R² = 0.80
