### 5. Feature Engineering and Improvement
#### Task 5: Feature Engineering

Notebook: notebooks/Feature_Engineering.ipynb
Steps:
- Create new features that might improve model performance.
- Test different feature combinations.
- Evaluate the impact of new features on model performance.


In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures


In [3]:
###Load the Preprocessed Dataset
def read_file(filename):
    filepath = '../Data/'+str(filename)
    return pd.read_csv(filepath)

X_test = read_file('X_test.csv')
y_test = read_file('y_test.csv')
X_train = read_file('X_train.csv')
y_train = read_file('y_train.csv')

print(len(X_test), len(X_train), len(y_test), len(y_train))

print("X_test data: \n", X_test)
print("y_test data: \n", y_test)

51 201 51 201
X_test data: 
         crim        zn     indus       nox        rm       age       dis  \
0  -0.236041  0.871318 -0.501726 -1.073076  0.439527 -0.440185  1.951028   
1  -0.214499  0.237584 -0.144178  0.338419 -0.580839  0.940816  1.253798   
2   0.089608 -0.596277  2.349763  1.856155 -0.930475  1.402401 -1.527156   
3   2.283708  0.737901 -0.837928  2.205235  2.565886  1.263550 -1.295462   
4  -0.332314 -0.596277 -0.823697  0.125936 -1.608341  1.038387 -1.003274   
5  -0.558640  2.072078 -0.403890 -0.830238  0.556073 -1.078148 -0.131059   
6   1.016799 -0.596277  0.216927  0.641966  0.919980  0.813224 -0.595803   
7  -0.626012  2.072078 -1.321774 -1.103430  0.575101 -0.616563  2.496430   
8  -0.505262  0.237584 -0.144178  0.338419 -0.854364 -0.819210  0.609211   
9  -0.323316 -0.596277 -0.314947 -0.815061 -0.188390 -2.035092  0.761716   
10 -0.543046 -0.596277  0.738129 -0.982012 -0.890041 -0.909276  0.073580   
11 -0.610532 -0.596277 -0.967784 -0.632932 -0.904312 -1.314

###Create New Feature

In [4]:
# Create new interaction features
X_train['LSTAT_RM'] = X_train['lstat'] * X_train['rm']
X_test['LSTAT_RM'] = X_test['lstat'] * X_test['rm']

# Add polynomial feature (e.g., squared value)
X_train['RM_squared'] = X_train['rm'] ** 2
X_test['RM_squared'] = X_test['rm'] ** 2


###Trained Liner Regression Model

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict
y_pred = lr_model.predict(X_test)


### Evaluate Model

In [6]:
#Evaluate Model
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


Mean Squared Error (MSE): 4.440668506679386
R² Score: 0.7930514603017373


In [9]:
# Save the updated dataset with new features
import joblib
joblib.dump({
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test
}, "../data/engineered_data.pkl")

# Save the model
joblib.dump(lr_model, "../data/linear_regression_with_features.pkl")


['../data/linear_regression_with_features.pkl']

### Polynomial Features 

In [13]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train with poly features
poly_lr = LinearRegression()
poly_lr.fit(X_train_poly, y_train)

# Evaluate
y_poly_pred = poly_lr.predict(X_test_poly)
print("Poly MSE:", mean_squared_error(y_test, y_poly_pred))
print("Poly R²:", r2_score(y_test, y_poly_pred))


Poly MSE: 1.5391802423963225e+20
Poly R²: -7.173043946361264e+18


In [14]:
# Save the model
joblib.dump(y_poly_pred, "../data/y_polymial_predication.csv")

['../data/y_polymial_predication.csv']