In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv('../data/boston_housing.csv.csv')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [4]:
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
df.columns = column_names

In [5]:
print("Original Dataset:")
print(df.head())

Original Dataset:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90    NaN  36.2  


In [6]:
df.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [7]:
df.fillna(df.median(), inplace=True)

In [8]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

# Feature Engineering

In [9]:
'''
RM_ZN: Interaction between the average number of rooms (RM) and the proportion of residential land zoned for lots (ZN).
CRIM_TAX: Interaction between crime rate (CRIM) and property tax rate (TAX).
'''

df['RM_ZN'] = df['RM'] * df['ZN']
df['CRIM_TAX'] = df['CRIM'] * df['TAX']

In [10]:
#Polynomial Features
'''
RM_squared: Square of the average number of rooms (RM).
LSTAT_cubed: Cube of the percentage of lower-status population (LSTAT).
'''

df['RM_squared'] = df['RM'] ** 2
df['LSTAT_cubed'] = df['LSTAT'] ** 3

In [11]:
#Binning Features
'''
bins: Defines the bin edges (e.g., 0–35, 36–70, 71+).
labels: Assigns category names to the bins.
pd.cut: Performs the binning operation.
'''

bins = [0, 35, 70, 100]
labels = ['Young', 'Middle-aged', 'Old']
df['AGE_category'] = pd.cut(df['AGE'], bins=bins, labels=labels)

In [12]:
df['AGE_category']

0      Middle-aged
1              Old
2      Middle-aged
3      Middle-aged
4      Middle-aged
          ...     
501    Middle-aged
502            Old
503            Old
504            Old
505            Old
Name: AGE_category, Length: 506, dtype: category
Categories (3, object): ['Young' < 'Middle-aged' < 'Old']

In [13]:
#Aggregation Features: Computes group statistics (e.g., median) for a feature
'''
groupby('CHAS'): Groups the data by the Charles River dummy variable (CHAS).
transform('median'): Calculates the median MEDV for each group and assigns it to all rows in the group.
MEDV_median_by_CHAS: Stores the computed median values.
'''

median_MEDV_by_CHAS = df.groupby('CHAS')['MEDV'].transform('median')
df['MEDV_median_by_CHAS'] = median_MEDV_by_CHAS
df['MEDV_median_by_CHAS'].head()

0    20.85
1    20.85
2    20.85
3    20.85
4    20.85
Name: MEDV_median_by_CHAS, dtype: float64

In [14]:
#Log Transformations: Applies a logarithmic transformation to reduce skewness in skewed features
'''
np.log1p: Computes the natural logarithm of x + 1 to handle zero values gracefully.
log_CRIM: Log-transformed crime rate (CRIM).
log_DIS: Log-transformed weighted distances to employment centers (DIS).
'''

df['log_CRIM'] = np.log1p(df['CRIM'])  # Use log1p to handle zero values
df['log_DIS'] = np.log1p(df['DIS'])

In [15]:
print("\nDataset After Feature Engineering:")
print(df.head())


Dataset After Feature Engineering:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  ...  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296  ...   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242  ...   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242  ...   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222  ...   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222  ...   

   LSTAT  MEDV   RM_ZN  CRIM_TAX  RM_squared  LSTAT_cubed  AGE_category  \
0   4.98  24.0  118.35   1.87072   43.230625   123.505992   Middle-aged   
1   9.14  21.6    0.00   6.60902   41.229241   763.551944           Old   
2   4.03  34.7    0.00   6.60418   51.624225    65.450827   Middle-aged   
3   2.94  33.4    0.00   7.18614   48.972004    25.412184   Middle-aged   
4  11.43  36.2    0.00  15.32910   51.079609  1493.271207   Middle-aged   

   MEDV_median_by_CHAS  log_CRIM   log_DIS  
0          

In [16]:
df.isnull().sum()

CRIM                   0
ZN                     0
INDUS                  0
CHAS                   0
NOX                    0
RM                     0
AGE                    0
DIS                    0
RAD                    0
TAX                    0
PTRATIO                0
B                      0
LSTAT                  0
MEDV                   0
RM_ZN                  0
CRIM_TAX               0
RM_squared             0
LSTAT_cubed            0
AGE_category           0
MEDV_median_by_CHAS    0
log_CRIM               0
log_DIS                0
dtype: int64

In [17]:
X = df.drop(columns=['MEDV'])  # Features
y = df['MEDV']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [18]:
#TRAINING A BASELINE MODEL
baseline_X_train = X_train[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']]
baseline_X_test = X_test[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']]

baseline_model = LinearRegression()
baseline_model.fit(baseline_X_train, y_train)

y_pred_baseline = baseline_model.predict(baseline_X_test)
mse_baseline = mean_squared_error(y_test, y_pred_baseline)

print(f"Baseline Model MSE: {mse_baseline}")

Baseline Model MSE: 24.99938479010357


In [19]:
from sklearn.preprocessing import LabelEncoder

# Apply label encoding to the 'AGE_category' column
label_encoder = LabelEncoder()
df['AGE_category_encoded'] = label_encoder.fit_transform(df['AGE_category'])

In [20]:
non_numeric_columns = df.select_dtypes(include=['object']).columns
non_numeric_columns

Index([], dtype='object')

In [21]:
label_encoder = LabelEncoder()
for col in non_numeric_columns:
    df[col] = label_encoder.fit_transform(df[col])

In [22]:
df.fillna(df.mean(numeric_only=True), inplace=True)

In [23]:
new_features = [
    'RM_ZN', 'CRIM_TAX', 'RM_squared', 'LSTAT_cubed',
    'MEDV_median_by_CHAS', 'log_CRIM', 'log_DIS'
]

In [24]:
X_train_new = pd.concat([X_train, df.loc[X_train.index, new_features]], axis=1)
X_test_new = pd.concat([X_test, df.loc[X_test.index, new_features]], axis=1)
X_test_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 173 to 75
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   CRIM                 102 non-null    float64 
 1   ZN                   102 non-null    float64 
 2   INDUS                102 non-null    float64 
 3   CHAS                 102 non-null    float64 
 4   NOX                  102 non-null    float64 
 5   RM                   102 non-null    float64 
 6   AGE                  102 non-null    float64 
 7   DIS                  102 non-null    float64 
 8   RAD                  102 non-null    int64   
 9   TAX                  102 non-null    int64   
 10  PTRATIO              102 non-null    float64 
 11  B                    102 non-null    float64 
 12  LSTAT                102 non-null    float64 
 13  RM_ZN                102 non-null    float64 
 14  CRIM_TAX             102 non-null    float64 
 15  RM_squared           102 no

In [25]:
from sklearn.preprocessing import LabelEncoder

# Apply label encoding to the 'age_category' column
label_encoder = LabelEncoder()
X_train_new['AGE_category'] = label_encoder.fit_transform(X_train_new['AGE_category'])
X_test_new['AGE_category'] = label_encoder.transform(X_test_new['AGE_category'])
X_train_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 404 entries, 477 to 102
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CRIM                 404 non-null    float64
 1   ZN                   404 non-null    float64
 2   INDUS                404 non-null    float64
 3   CHAS                 404 non-null    float64
 4   NOX                  404 non-null    float64
 5   RM                   404 non-null    float64
 6   AGE                  404 non-null    float64
 7   DIS                  404 non-null    float64
 8   RAD                  404 non-null    int64  
 9   TAX                  404 non-null    int64  
 10  PTRATIO              404 non-null    float64
 11  B                    404 non-null    float64
 12  LSTAT                404 non-null    float64
 13  RM_ZN                404 non-null    float64
 14  CRIM_TAX             404 non-null    float64
 15  RM_squared           404 non-null    float6

In [26]:
new_model = LinearRegression()
new_model.fit(X_train_new, y_train)

In [27]:
y_pred_new = new_model.predict(X_test_new)
mse_new = mean_squared_error(y_test, y_pred_new)

print(f"New Model MSE: {mse_new}")

New Model MSE: 12.397976394335956
