In [66]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score



In [67]:
df = pd.read_csv('../data/BostonHousing.csv')
df.head()


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
df['is_old'] = df['age'].apply(lambda x: 1 if x > 70 else 0) #Turns the 'age' variable (percentage of homes built before 1940) into a simple Yes/No flag — helpful for catching sharp distinctions.
df['rm_lstat'] = df['rm'] * df['lstat'] #combines rooms and poverty level
df.head()


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv,is_old,rm_lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,0,32.7435
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,1,58.68794
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,0,28.95555
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,0,20.57412
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,0,38.09351


In [None]:
df=df.drop(columns=["rm","lstat","age"])   #dropping these three columns because we have created new columns based on these three columns.
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,dis,rad,tax,ptratio,b,medv,is_old,rm_lstat
0,0.00632,18.0,2.31,0,0.538,4.09,1,296,15.3,396.9,24.0,0,32.7435
1,0.02731,0.0,7.07,0,0.469,4.9671,2,242,17.8,396.9,21.6,1,58.68794
2,0.02729,0.0,7.07,0,0.469,4.9671,2,242,17.8,392.83,34.7,0,28.95555
3,0.03237,0.0,2.18,0,0.458,6.0622,3,222,18.7,394.63,33.4,0,20.57412
4,0.06905,0.0,2.18,0,0.458,6.0622,3,222,18.7,396.9,36.2,0,38.09351


In [70]:
X = df.drop('medv', axis=1)
y = df['medv']
scaler=StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2,random_state=42)


In [71]:
X_train

array([[ 1.32780421, -0.48772236,  1.01599907, ..., -0.07887794,
         0.87353661,  1.44702744],
       [-0.34750602, -0.48772236, -0.43725801, ...,  0.42701755,
        -1.14477172, -0.70256665],
       [-0.41648392,  1.01446252, -0.74074945, ...,  0.06113692,
        -1.14477172, -0.75951714],
       ...,
       [-0.41877066,  2.94584308, -1.3316823 , ...,  0.37570436,
        -1.14477172, -0.95389807],
       [ 0.87825441, -0.48772236,  1.01599907, ..., -2.70626713,
         0.87353661,  2.02229398],
       [-0.39389588, -0.48772236, -0.37597609, ..., -3.13442533,
         0.87353661, -0.21730384]], shape=(404, 12))

In [72]:
model=LinearRegression()
model.fit(X_train,y_train)


In [73]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


In [75]:
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")


Mean Squared Error: 20.80
R-squared Score: 0.72


In [None]:
#Hence we increased our model's accuracy from 67% to 72% by combining some features(rm and lstat into rm_lstat) and changing some numerical features(age) into binary flag feature(is_old).