### 5. Feature Engineering and Improvement
#### Task 5: Feature Engineering

Notebook: notebooks/Feature_Engineering.ipynb
Steps:
- Create new features that might improve model performance.
- Test different feature combinations.
- Evaluate the impact of new features on model performance.


In [36]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures


In [45]:
# # Load dataset
file_path = (r"D:\Assignment\Machine Lerning\machine-learning-introduction-Sabuna-Gamal\Data\BostonHousing.csv") 
df = pd.read_csv(file_path)  # Read CSV into a DataFrame

# Load the preprocessed dataset
X_test = pd.read_csv(r"D:\Assignment\Machine Lerning\machine-learning-introduction-Sabuna-Gamal\Data\X_test.csv")
X_train = pd.read_csv(r"D:\Assignment\Machine Lerning\machine-learning-introduction-Sabuna-Gamal\Data\X_train.csv")
y_test = pd.read_csv(r"D:\Assignment\Machine Lerning\machine-learning-introduction-Sabuna-Gamal\Data\y_test.csv")
y_train = pd.read_csv(r"D:\Assignment\Machine Lerning\machine-learning-introduction-Sabuna-Gamal\Data\y_train.csv")

# Display the first few rows
print(X_test.head())
print(X_train.head())
print(y_test.head())
print(y_train.head())


       crim        zn     indus      chas       nox        rm       age  \
0 -0.236041  0.871318 -0.501726 -0.260378 -1.073076  0.439527 -0.440185   
1 -0.214499  0.237584 -0.144178 -0.260378  0.338419 -0.580839  0.940816   
2  0.089608 -0.596277  2.349763 -0.260378  1.856155 -0.930475  1.402401   
3  2.283708  0.737901 -0.837928 -0.260378  2.205235  2.565886  1.263550   
4 -0.332314 -0.596277 -0.823697 -0.260378  0.125936 -1.608341  1.038387   

        dis       rad       tax   ptratio         b     lstat  
0  1.951028  1.484407  0.243996  0.358605 -2.474124 -0.268091  
1  1.253798  0.281608 -0.003673 -1.785172 -0.692920  1.404934  
2 -1.527156 -0.319792  1.638760  1.512946  0.098231  2.336354  
3 -1.295462  0.281608 -0.616326 -2.994482  0.303070  0.113747  
4 -1.003274  0.281608 -0.199200 -1.015611  0.819619  0.873009  
       crim        zn     indus      chas       nox        rm       age  \
0  0.569941 -0.596277  0.339668  3.840573 -0.192789 -2.007925  1.042140   
1 -0.561767 -0.

In [46]:
# Check for missing values
print("Missing values before handling:\n", df.isnull().sum())

# Drop rows with NaN values (only if few values are missing)
df = df.dropna()

Missing values before handling:
 crim       0
zn         0
indus      0
chas       0
nox        0
rm         5
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64


###Create New Features

In [47]:
#Log Transformation (Handling Skewness)
# Apply log transformation to features with high skewness
df['lstat_log'] = np.log1p(df['lstat'])  # Log transformation
df['rm_squared'] = df['rm'] ** 2  # Square transformation


In [48]:
#Polynomial Features (Adding Complexity)
# Select relevant features
poly_features = ['lstat', 'rm', 'ptratio']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_transformed = poly.fit_transform(df[poly_features])

# Convert to DataFrame and add to original dataset
poly_df = pd.DataFrame(poly_transformed, columns=poly.get_feature_names_out(poly_features))
df = pd.concat([df, poly_df], axis=1)


In [49]:
print(df.columns[df.columns.duplicated()])


Index(['lstat', 'rm', 'ptratio'], dtype='object')


In [50]:
# Remove duplicate columns if any
df = df.loc[:, ~df.columns.duplicated()].copy()

 ###Train and Evaluate the Model with New Features

In [53]:
# Ensure correct data types
df['rm'] = df['rm'].astype(float)
df['lstat'] = df['lstat'].astype(float)

# Create unique interaction feature names
df['rm_lstat_interaction'] = df['rm'].values * df['lstat'].values
df['age_dis_interaction'] = df['age'].values * df['dis'].values

# Check if the new columns are correctly added
print(df.columns)


Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv', 'lstat_log', 'rm_squared', 'lstat^2',
       'lstat rm', 'lstat ptratio', 'rm^2', 'rm ptratio', 'ptratio^2',
       'rm_lstat_interaction', 'age_dis_interaction'],
      dtype='object')


In [55]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Assuming df is your DataFrame with features and target variable

# Impute missing values in features (X) using mean for numerical features
X_train = df.drop('medv', axis=1)  # Drop target column
y_train = df['medv']  # Target column

# Apply SimpleImputer to impute missing values in the features
imputer = SimpleImputer(strategy='mean')  # You can change the strategy (e.g., median, mode)
X_train_imputed = imputer.fit_transform(X_train)

# Impute target variable if it has missing values
y_train_imputed = y_train.fillna(y_train.mean())  # Impute target using mean or median


###Interaction Features (Feature Interactions)

In [59]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel


# Step 2: Handle missing values
imputer = SimpleImputer(strategy='mean')
X = df.drop('medv', axis=1)  # Replace with your actual target column name
y = df['medv']
X_imputed = imputer.fit_transform(X)

####Compare Model Performance

In [60]:
# Train a baseline model with original features
X_baseline = df[['lstat', 'rm', 'ptratio']]  # Using only basic features
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_baseline, y, test_size=0.2, random_state=42)

# Train the baseline model
baseline_model = LinearRegression()
baseline_model.fit(X_train_base, y_train_base)
y_pred_base = baseline_model.predict(X_test_base)

# Evaluate baseline performance
mse_base = mean_squared_error(y_test_base, y_pred_base)
r2_base = r2_score(y_test_base, y_pred_base)

print(f"\nBaseline Model Performance:")
print(f"Mean Squared Error (MSE): {mse_base:.2f}")
print(f"R-squared (R²): {r2_base:.2f}")

# Compare improvement
improvement = ((r2 - r2_base) / r2_base) * 100
print(f"\nImprovement in R² Score: {improvement:.2f}%")


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values