# End-to-End ML Pipeline — California Housing

## Objective
Build a regression pipeline for predicting house prices.

In [None]:

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the California housing dataset; as_frame=True asks scikit-learn to
# return pandas objects instead of bare arrays.
data = fetch_california_housing(as_frame=True)

# The combined table (features plus the target column) lives under "frame".
df = data["frame"]

# Preview the first rows as the cell output.
df.head()


In [None]:

# Count missing values in each column (isna is the same check as isnull).
df.isna().sum()


In [None]:

# Separate the regression target from the predictor columns.
y = df['MedHouseVal']
X = df.drop(columns=['MedHouseVal'])

# Replace missing feature values with each column's median.
# NOTE(review): the imputer is fitted on every row in this cell, i.e. before
# any train/test split, so the learned medians include rows that may later be
# held out for evaluation.
imputer = SimpleImputer(strategy='median')
imputer.fit(X)
X_imputed = imputer.transform(X)


In [None]:

# Split the raw features first so every preprocessing statistic below is
# learned from training rows only. The X_imputed array from the earlier cell
# comes from an imputer fitted on all rows, which lets test-set medians leak
# into training, so it is deliberately not used here.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the imputation medians on the training split, then apply them
# unchanged to the test split.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Same rule for scaling: fit on the training rows, only transform the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Ordinary least-squares regression on the scaled training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out, scaled test features.
y_pred = model.predict(X_test_scaled)


In [None]:

# Score the test-set predictions against the true target values.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# RMSE is the square root of MSE, which puts the error back in target units.
rmse = mse ** 0.5

# Show the three metrics as the cell output.
(mse, rmse, r2)


In [None]:

# Destination for the exported actual-vs-predicted table.
output_path = Path('/mnt/data/california_predictions.csv')

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise this final cell fails in a fresh environment.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Pair each held-out target value with the model's prediction for that row.
# y_test.values drops the original index so both columns align by position.
pred_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
pred_df.to_csv(output_path, index=False)

# Preview the exported table as the cell output.
pred_df.head()
