In [4]:
# Loading libraries
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

# Load the housing dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("housing.csv")

# info() prints its summary itself, but head() and describe() only *return*
# DataFrames. A notebook renders just the final expression of a cell, so the
# head() preview used to be discarded silently. display() (injected into the
# namespace by IPython) renders each frame explicitly.
data.info()
display(data.head())
display(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [2]:
# EDA report: profile the loaded `data` frame and export the result as HTML.
report_title = "Housing Data Profiling Report"
report_path = "housing_data_profiling_report.html"

profile = ProfileReport(data, title=report_title)
profile.to_file(report_path)

100%|██████████| 10/10 [00:00<00:00, 66.25it/s]<00:00, 37.01it/s, Describe variable: ocean_proximity]   
Summarize dataset: 100%|██████████| 100/100 [00:11<00:00,  8.73it/s, Completed]                                   
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.02s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 66.79it/s]


In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Separate features and target
X = data.drop("median_house_value", axis=1)
y = data["median_house_value"]

# Identify categorical and numerical columns
cat_features = ["ocean_proximity"]
num_features = X.drop(cat_features, axis=1).columns

# Hold out 20% of the rows for evaluation. Previously the split was never
# performed, so X_train / X_test / y_train / y_test only existed as leftover
# kernel state. The fixed seed keeps the split reproducible across runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert the raw frames into fully numeric matrices that any estimator can
# consume: median-impute the numeric columns (total_bedrooms has missing
# values) and one-hot encode the string column. The transformer is fitted on
# the training rows only, so no statistics leak from the test set.
preprocessor = ColumnTransformer(
    [
        ("num", SimpleImputer(strategy="median"), list(num_features)),
        # handle_unknown="ignore": a category unseen during fit encodes as all zeros
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ],
    sparse_threshold=0,  # always return a dense array
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Build pipeline: scale → model (imputation already happened in `preprocessor`)
pipe = Pipeline([
    ('scaler', StandardScaler()),                   # scale features
    ('model', LinearRegression())
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [22]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit a 100-tree random forest (seeded for repeatable results) on the training
# split and predict the held-out rows. fit() returns the estimator itself, so
# the two calls can be chained.
model = RandomForestRegressor(random_state=42, n_estimators=100)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions against the held-out targets.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike the squared error
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report every metric rounded to two decimals.
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R2 Score: {r2:.2f}")

Mean Squared Error: 2362554880.05
Mean Absolute Error: 31694.23
Root Mean Squared Error: 48606.12
R2 Score: 0.82


In [26]:
import matplotlib.pyplot as plt

# Quick look at the first five predictions.
print("Sample Predicted Prices:", y_pred[:5])

# Actual vs. predicted scatter, drawn on the current axes.
ax = plt.gca()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted Housing Prices")

# Save to disk before show(), then render the figure.
ax.figure.savefig("scatter_plot.png")
plt.show()

Sample Predicted Prices: [ 50764.    67113.   472226.58 254462.02 267130.  ]


  plt.show()
