# Feature Engineering

In [25]:
# Import the libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('..\data\BostonHousing.csv')
df.head()

  df = pd.read_csv('..\data\BostonHousing.csv')


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


# New features that can be introduced
1. **Crime to Population Lower Status Ratio:** The areas with high crime and the area with lower status people can effect the MEDV.
2. **Urbanization Index:** Areas with significant industrial value and high property tax are more urbanized and can have more property value which effects the medv.
3. **Room Categories:** Houses can be categorized on the basis of number of rooms.

In [26]:
#Crime and Proportion of Lower Status
df['crim_lstat'] = df['crim'] * df['lstat']

#Urbanization Index
df['urban_index'] = df['indus'] + df['tax']

#Room Categories
df['room_category'] = pd.cut(df['rm'], bins=[0, 4, 6, 8, 10], labels=['small', 'medium', 'large', 'very_large'])

df.head()


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv,crim_lstat,urban_index,room_category
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,0.031474,298.31,large
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,0.249613,249.07,large
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,0.109979,249.07,large
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,0.095168,224.18,large
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,0.368036,224.18,large


**Mean Squared Error from previous datasets:** 27.40683628433727
**R-squared:** 0.6262727067493545

In [27]:
#Import the libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

#Using One-Hot encoding for room_category to represent it in numerical value
df = pd.get_dummies(df, columns=['room_category'], drop_first=True)
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv,crim_lstat,urban_index,room_category_medium,room_category_large,room_category_very_large
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,0.031474,298.31,False,True,False
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,0.249613,249.07,False,True,False
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,0.109979,249.07,False,True,False
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,0.095168,224.18,False,True,False
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,0.368036,224.18,False,True,False


In [28]:
#Initiatize the data with new features
X = df.drop(columns=['medv'])
y = df['medv']

#Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Scaling the data using Standard Scaler
scaler= StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [29]:
#Initialize the model
model =LinearRegression()

#Train the data
model.fit(X_train, y_train)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
#New Mean Squared error with new features
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 25.064241015239265
R-squared: 0.6582169917452152


### Comparision of the datasets before and after New Features are added
- As we can see, both the Mean Squared Error(MSE) and R-Square Error have changes after the addition of new features.

- **Before Addition of New Features:**
    1. Mean Squared Error: 27.40683628433727
    2. R-squared: 0.6262727067493545

- **After Addition of New Features:**
    1. Mean Squared Error: 25.064241015239265
    2. R-squared: 0.6582169917452152

- Higher R-square means that there is more variance so the model fits the data more and the decrease in Mean Squared Error(MSE)means the predicted values are closer to the actual values which indicates better model performnace.