# **Feature Engineering and Improvement**

**Task 5. Feature Engineering**

In [None]:
# Create new features that might improve model performance
# Test different feature combinations
# Evaluate the impact of new features on model performance
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Read the train and test splits of the Boston Housing data from the mounted
# drive; both CSVs are parsed with pandas' default settings.
train_data, test_data = map(
    pd.read_csv,
    (
        'drive/MyDrive/Datasets/BostonHousing_train.csv',
        'drive/MyDrive/Datasets/BostonHousing_test.csv',
    ),
)

# Feature Engineering
def create_new_features(df):
    """Return a copy of ``df`` with engineered feature columns appended.

    The input frame is left untouched, so the function can safely be called
    more than once on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns ``'rm'``, ``'lstat'`` and ``'age'``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column plus:

        * ``rm_lstat``, ``rm_age``        - pairwise interaction terms
        * ``rm_squared``, ``lstat_squared`` - second-order polynomial terms
        * ``log_lstat``                   - natural log of ``lstat``
        * ``age_binned``                  - categorical with the labels
          ``'0-25'``, ``'25-50'``, ``'50-75'``, ``'75-100'``
    """
    # Work on a copy: writing the new columns into the caller's frame would
    # silently change data that other cells may still be using.
    df = df.copy()

    # Interaction features
    df['rm_lstat'] = df['rm'] * df['lstat']
    df['rm_age'] = df['rm'] * df['age']

    # Polynomial features
    df['rm_squared'] = df['rm'] ** 2
    df['lstat_squared'] = df['lstat'] ** 2

    # Log transformation (np.log yields -inf / NaN for non-positive inputs,
    # so this assumes lstat is strictly positive)
    df['log_lstat'] = np.log(df['lstat'])

    # Binning. include_lowest=True closes the first interval on the left so an
    # age of exactly 0 lands in '0-25' instead of becoming NaN (which would
    # later be one-hot encoded as all zeros, i.e. look like the baseline bin).
    df['age_binned'] = pd.cut(
        df['age'],
        bins=[0, 25, 50, 75, 100],
        labels=['0-25', '25-50', '50-75', '75-100'],
        include_lowest=True,
    )

    return df


In [None]:

# Run both splits through the same pipeline: add the engineered columns, then
# one-hot encode 'age_binned'. drop_first=True omits the dummy column for the
# first category, leaving it as the implicit baseline.
train_data, test_data = (
    pd.get_dummies(
        create_new_features(split_frame),
        columns=['age_binned'],
        drop_first=True,
    )
    for split_frame in (train_data, test_data)
)

# Candidate feature sets: every set starts from the baseline pair
# ('rm', 'lstat') and appends one group of engineered columns; the last entry
# combines all of the groups.
feature_sets = [
    ['rm', 'lstat', *engineered]
    for engineered in (
        (),
        ('rm_lstat',),
        ('rm_squared', 'lstat_squared'),
        ('log_lstat',),
        ('age_binned_25-50', 'age_binned_50-75', 'age_binned_75-100'),
        (
            'rm_lstat', 'rm_squared', 'lstat_squared', 'log_lstat',
            'age_binned_25-50', 'age_binned_50-75', 'age_binned_75-100',
        ),
    )
]

# Scaling options to compare; 'none' means the features are used as-is.
preprocessing_steps = ['none', 'standard', 'minmax']

def preprocess_data(X_train, X_test, method):
    """Optionally scale the train/test feature matrices.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    method : str
        ``'standard'`` selects ``StandardScaler``, ``'minmax'`` selects
        ``MinMaxScaler``. Any other value (including ``'none'``) disables
        scaling.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` unchanged when scaling is disabled; otherwise
        the scaler's outputs for the two splits. The scaler is fitted on the
        training split only and then applied to the test split.
    """
    use_standard = method == 'standard'

    # Guard clause: anything that is not a recognised scaler name is a no-op.
    if not (use_standard or method == 'minmax'):
        return X_train, X_test

    scaler = StandardScaler() if use_standard else MinMaxScaler()

    # Fit on the training data first, then reuse the fitted statistics for test.
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled

In [None]:
# Collect performance metrics for every (feature set, preprocessing) pair
results = []

# The target column does not depend on the feature set or the scaler, so it is
# sliced once up front instead of on every iteration.
y_train = train_data['medv']
y_test = test_data['medv']

for features in feature_sets:
    # The feature matrices depend only on the feature set, not on the scaler,
    # so they are built once per feature set.
    X_train = train_data[features]
    X_test = test_data[features]

    for method in preprocessing_steps:
        # Preprocess data
        X_train_preprocessed, X_test_preprocessed = preprocess_data(X_train, X_test, method)

        # Train the model
        # NOTE(review): fit_intercept=False forces the fit through the origin.
        # StandardScaler and MinMaxScaler both shift the features, and a model
        # without an intercept cannot absorb that shift, so differences between
        # preprocessing methods below are confounded by this setting. Confirm
        # that the missing intercept is intentional.
        model = LinearRegression(fit_intercept=False)
        model.fit(X_train_preprocessed, y_train)

        # Make predictions
        y_pred = model.predict(X_test_preprocessed)

        # Evaluate the model on the held-out test split
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store the results (one record per combination)
        results.append({
            'features': features,
            'preprocessing': method,
            'metrics (MSE, R2)': (round(mse, 4), round(r2, 4))
        })

# Convert the results to a DataFrame for better visualization
results_df = pd.DataFrame(results)
print(results_df)

# Save the results to a CSV file
results_df.to_csv('drive/MyDrive/Datasets/model_performance_comparison_with_new_features.csv', index=False)

print("Trained with new features and saved successfully.")


                                             features preprocessing  \
0                                         [rm, lstat]          none   
1                                         [rm, lstat]      standard   
2                                         [rm, lstat]        minmax   
3                               [rm, lstat, rm_lstat]          none   
4                               [rm, lstat, rm_lstat]      standard   
5                               [rm, lstat, rm_lstat]        minmax   
6              [rm, lstat, rm_squared, lstat_squared]          none   
7              [rm, lstat, rm_squared, lstat_squared]      standard   
8              [rm, lstat, rm_squared, lstat_squared]        minmax   
9                              [rm, lstat, log_lstat]          none   
10                             [rm, lstat, log_lstat]      standard   
11                             [rm, lstat, log_lstat]        minmax   
12  [rm, lstat, age_binned_25-50, age_binned_50-75...          none   
13  [r

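- The models trained without any preprocessing (scaling) generally performed better than those with preprocessing. Note, however, that every model here is fit with `fit_intercept=False`: both `StandardScaler` and `MinMaxScaler` shift the features, and a model without an intercept cannot compensate for that shift, so this gap likely reflects the missing intercept rather than a genuine drawback of scaling.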
- The best performance was obtained with the feature set `[rm, lstat, rm_lstat, rm_squared, lstat_squared, log_lstat, age_binned_25-50, age_binned_50-75, age_binned_75-100]`, yielding an MSE of 18.5781 and an R² of 0.7445.