In [17]:
# notebooks/Feature_Engineering.ipynb
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score




In [19]:
# Load the raw housing dataset from the repository's data directory
# (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/HousingData.csv')


In [21]:
# Handle missing values
# CHAS is one-hot encoded below, so its gaps are filled with the most frequent
# level. A mean fill would invent a fractional value that get_dummies would
# then encode as if it were a genuine category. All other columns keep the
# mean fill.
# NOTE(review): the imputers and the scaler are fit on the full dataset; if a
# train/test split happens downstream, fit them on the training rows only to
# avoid leaking test statistics.
numeric_cols = [col for col in df.columns if col != 'CHAS']

imputer = SimpleImputer(strategy='mean')
chas_imputer = SimpleImputer(strategy='most_frequent')

df_imputed = pd.DataFrame(
    imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols,
    index=df.index,
)
df_imputed['CHAS'] = chas_imputer.fit_transform(df[['CHAS']]).ravel()

# Encode categorical variables (CHAS)
df_imputed = pd.get_dummies(df_imputed, columns=['CHAS'], drop_first=True)

# Normalize/standardize numerical features
# get_dummies appends the encoded columns after MEDV, so MEDV is no longer the
# last column. Take the labels from the exact frame that is scaled rather than
# from df_imputed.columns[:-1], which would mislabel a feature as 'MEDV'.
feature_df = df_imputed.drop('MEDV', axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_df)
scaled_df = pd.DataFrame(
    scaled_features,
    columns=feature_df.columns,
    index=feature_df.index,
)

In [24]:
# Feature engineering: expand the scaled features into degree-2 polynomial
# terms (the original columns, their squares, and pairwise products; no
# constant bias column).
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(scaled_df)
poly_columns = poly.get_feature_names_out(scaled_df.columns)

# Label the expanded matrix and attach the target variable alongside it
poly_df = pd.DataFrame(poly_features, columns=poly_columns).assign(
    MEDV=df_imputed['MEDV']
)

# Persist the engineered dataset (row index is not written)
poly_df.to_csv('../data/poly_features.csv', index=False)

