<a href="https://colab.research.google.com/github/Thanishka2727/-House-Price-Prediction/blob/main/House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # For saving the model

# ---------------------------------------------------------------------------
# End-to-end workflow: load the California housing data, fit a baseline
# random-forest pipeline, tune it with a randomized search, report test-set
# metrics for both models, and save the tuned pipeline to disk.
# ---------------------------------------------------------------------------

# Configuration: every tunable value is defined once here.
RANDOM_STATE = 42  # Shared seed for the split, the forest and the search
TEST_SIZE = 0.2  # Fraction of rows held out for testing
N_SEARCH_ITER = 5  # Number of parameter settings sampled by the search
CV_FOLDS = 3  # Number of cross-validation folds used by the search
MODEL_PATH = "house_price_model.pkl"  # Output file for the tuned pipeline


def evaluate_on_test(estimator, X_eval, y_eval, heading, metric_prefix=""):
    """Predict with a fitted estimator and print its MSE and R² score.

    Parameters
    ----------
    estimator : fitted object exposing ``predict``
        The model (or pipeline) to evaluate.
    X_eval, y_eval :
        Features and true targets to score against.
    heading : str
        Line printed before the metrics (printed verbatim).
    metric_prefix : str, optional
        Text prepended to each metric label, e.g. ``"Best Model "``.

    Returns
    -------
    tuple
        ``(predictions, mse, r2)`` so callers can keep the values.
    """
    predictions = estimator.predict(X_eval)
    mse_value = mean_squared_error(y_eval, predictions)
    r2_value = r2_score(y_eval, predictions)

    print(heading)
    print(f'{metric_prefix}Mean Squared Error: {mse_value:.2f}')
    print(f'{metric_prefix}R² Score: {r2_value:.4f}')
    return predictions, mse_value, r2_value


# Step 1: Load the dataset
data = fetch_california_housing(as_frame=True)
df = data.frame

# Display the first few rows of the dataset
print("Dataset Head:")
print(df.head())

# Step 2: Define features and target variable
X = df.drop(columns=['MedHouseVal'])  # Features
y = df['MedHouseVal']  # Target variable (Median House Value)

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Step 4: Identify numerical columns (every feature column is passed to the scaler)
numerical_cols = X_train.columns

# Step 5: Data Preprocessing
# NOTE(review): tree ensembles are generally insensitive to feature scaling;
# the scaler is kept so the structure of the saved pipeline is unchanged.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols)
])

# Step 6: Define the model (Random Forest Regressor)
model = RandomForestRegressor(random_state=RANDOM_STATE)

# Step 7: Create a pipeline (the 'model' step name is what the 'model__*'
# keys in param_dist below refer to)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Step 8: Train the model
print("Training the model...")
pipeline.fit(X_train, y_train)

# Steps 9-10: Make predictions and evaluate the baseline model
y_pred, mse, r2 = evaluate_on_test(
    pipeline, X_test, y_test, "\nInitial Model Performance:"
)

# Step 11: Hyperparameter tuning using RandomizedSearchCV
print("\nStarting hyperparameter tuning...")
param_dist = {
    'model__n_estimators': [100, 200],  # Number of trees in the forest
    'model__max_depth': [None, 10, 20],  # Maximum depth of the tree
    'model__min_samples_split': [2, 5, 10]  # Minimum samples required to split a node
}

# Use RandomizedSearchCV with fewer iterations and parallel processing
random_search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=N_SEARCH_ITER,  # Number of parameter settings sampled
    cv=CV_FOLDS,  # Number of cross-validation folds
    scoring='r2',  # Evaluation metric
    n_jobs=-1,  # Use all available CPU cores
    random_state=RANDOM_STATE
)

# Fit the RandomizedSearchCV (only the training split is used for the search)
random_search.fit(X_train, y_train)

# Step 12: Get the best model and evaluate it
# best_estimator_ is available because the search refits by default (refit=True).
best_model = random_search.best_estimator_
y_pred_best, mse_best, r2_best = evaluate_on_test(
    best_model, X_test, y_test, "\nBest Model Performance:", "Best Model "
)
print(f'Best Parameters: {random_search.best_params_}')

# Step 13: Save the trained model (the whole tuned pipeline, scaler included)
joblib.dump(best_model, MODEL_PATH)
print(f"\nModel saved as {MODEL_PATH}")

Dataset Head:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  
Training the model...

Initial Model Performance:
Mean Squared Error: 0.26
R² Score: 0.8053

Starting hyperparameter tuning...

Best Model Performance:
Best Model Mean Squared Error: 0.25
Best Model R² Score: 0.8063
Best Parameters: {'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__max_depth': None}

Model saved as house_price_model.pkl
