In [None]:
# 0. 🛠️ Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# 1️⃣ Load Data
# Read the Boston Housing CSV directly from its hosted location on GitHub.
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(url)
print(df.head())
# Per-column count of missing values. The line breaks around the label must be
# written as "\n" escapes: a double-quoted string literal cannot span
# several physical lines, so the unescaped form does not parse.
print("\nMissing values:\n", df.isnull().sum())

In [None]:
# 2️⃣ Preprocess
# Keep only complete observations: a row is removed as soon as any of its
# columns holds a missing value (the defaults of dropna, spelled out here).
df = df.dropna(axis=0, how="any")

In [None]:
# 3️⃣ Visualize key relationships
# Pairwise plots restricted to three candidate predictors and the target
# column `medv`.
sns.pairplot(
    df.loc[:, ['rm', 'lstat', 'ptratio', 'medv']]
)
# y=1.02 places the figure-level title slightly above the top of the grid.
plt.suptitle("Feature relationships vs Price (medv)", y=1.02)
plt.show()

In [None]:
# Correlation heatmap
# Draw the pairwise correlations of every column in `df` on an explicit
# 8x6 axes, annotating each cell with its coefficient.
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# 4️⃣ Select Features & Split
# Predictors are rm, lstat and ptratio; the target is medv.
X = df.loc[:, ['rm', 'lstat', 'ptratio']]
y = df.loc[:, 'medv']
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# 5️⃣ Train Linear Regression
# Fit on the training split only. The fit call is kept as the final
# expression so the notebook still displays the fitted estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# 6️⃣ Predict & Evaluate
# Score the model on the held-out rows it was not trained on.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# Report both metrics rounded to two decimals, one per line.
print(f"MSE: {mse:.2f}", f"R²: {r2:.2f}", sep="\n")

In [None]:
# 7️⃣ Plot actual vs predicted
pred_fig, pred_ax = plt.subplots()
pred_ax.scatter(y_test, y_pred, alpha=0.7)
# Dashed red y = x reference line spanning the full range of the target `y`;
# points on this line are predictions that match the actual value exactly.
price_span = [y.min(), y.max()]
pred_ax.plot(price_span, price_span, 'r--')
pred_ax.set_xlabel("Actual Price")
pred_ax.set_ylabel("Predicted Price")
pred_ax.set_title("Actual vs Predicted House Prices")
plt.show()

### 🔍 Summary:
- Used Boston Housing dataset with 506 samples and multiple features.
- Checked for missing values (none found) and dropped any incomplete rows; selected 3 key predictors (`rm`, `lstat`, `ptratio`).
- Visualized relationships & correlations.
- Trained Linear Regression model (80/20 split).
- Insight: Homes with more rooms (`rm`) and in areas with a smaller share of lower-status residents (`lstat`) tend to have higher prices (`medv`).