In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import numpy as np

# Load the raw housing data and preview its first rows.
df = pd.read_csv("housing.csv")
print(df.head())

# Clean-up happens in two chained steps:
#   1. fill missing numeric values with the median of their own column;
#   2. one-hot encode the text column `ocean_proximity` into 0/1 dummy
#      columns, dropping the first category (drop_first=True) so the
#      dummies are not perfectly collinear.
# NOTE(review): the medians are computed over the full dataset, i.e. before
# the train/test split further down -- confirm that this is acceptable.
df = (
    df.fillna(df.median(numeric_only=True))
    .pipe(pd.get_dummies, columns=["ocean_proximity"], drop_first=True)
)

# Features are every column except the target; the target is the house value.
X = df.drop(columns="median_house_value")
y = df["median_house_value"]

# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression model on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate performance.
# RMSE (smaller is better) is computed as the explicit square root of the MSE:
# the `squared=False` shortcut of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so np.sqrt keeps this cell working on
# both older and current releases while producing the same value.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R² score: closer to 1 means a better fit.
r2 = r2_score(y_test, y_pred)
print("Model Performance :")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")
# 5-fold cross-validation: the data is split into 5 folds, and shuffle=True
# mixes the rows randomly (reproducibly, via random_state=42) before splitting.
print("Performing 5-Fold Cross Validation..")
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score() trains and scores the model once per fold.
# The scorer is "neg_mean_squared_error" because scikit-learn scorers follow a
# "higher is better" convention, so the MSE comes back negated.
cv_scores = cross_val_score(model, X, y, cv=kf, scoring="neg_mean_squared_error")

# Flip the sign back and take the square root to get a positive RMSE per fold.
cv_rmse = np.sqrt(np.negative(cv_scores))
print("Cross-Validation RMSE for each fold:", cv_rmse.round(2))
print(f"Average CV RMSE: {np.mean(cv_rmse):.2f}")

# Put the true test targets next to the model's predictions (rows keep the
# index of y_test) and show the first few rows.
comparison = y_test.to_frame(name="Actual").assign(Predicted=y_pred)
print()
print("Actual vs Predicted:")
print(comparison.head())





   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
Model Performance :
RMSE: 70060.52
R² Score: 0.6254
Performing 5-Fold Cross

