In [1]:
# Import necessary libraries
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Set random seed for reproducibility
np.random.seed(42)

# Location of the house-price CSV.
# The original machine-specific path stays as the default so existing setups keep
# working; set the BENGALURU_HOUSE_DATA environment variable to point the notebook
# at the file on any other machine without editing this cell.
DEFAULT_DATASET_PATH = r'C:\Users\Arya\Desktop\Bengaluru_House_Data.csv'
dataset_path = os.environ.get('BENGALURU_HOUSE_DATA', DEFAULT_DATASET_PATH)

# Fail early with an actionable message (same exception type read_csv would raise).
if not os.path.isfile(dataset_path):
    raise FileNotFoundError(
        f"Dataset not found at {dataset_path!r}; "
        "set the BENGALURU_HOUSE_DATA environment variable to the CSV location."
    )

# Load the dataset
df = pd.read_csv(dataset_path)
  

In [2]:
 
    # Data preprocessing: inspect the raw frame, clean it, and build the train/test split.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Print column names to see what's available
print("\nAvailable columns:")
print(df.columns.tolist())

# Handle missing values
print("\nMissing values before preprocessing:")
print(df.isnull().sum())

# Drop rows with missing values in price (target variable)
df = df.dropna(subset=['price'])

# Fill missing values in other columns
df['size'] = df['size'].fillna(df['size'].mode()[0])
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(df['balcony'].median())

# Feature engineering
# 'size' holds labels such as "2 BHK" / "4 Bedroom": keep only digits and dots to
# get the room count as a float. Going through astype(str) lets the same expression
# handle a column that is already numeric (e.g. when this cell is re-run).
df['size'] = df['size'].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)

# 'total_sqft' is the actual area column. Parse plain numbers ("1056") and
# "low - high" ranges (replaced by their midpoint); any other text becomes NaN
# and is left for the median imputer in the modelling pipeline.
sqft_bounds = (
    df['total_sqft'].astype(str)
    .str.extract(r'^\s*(\d+(?:\.\d+)?)\s*(?:-\s*(\d+(?:\.\d+)?))?\s*$')
    .astype(float)
)
df['total_sqft'] = sqft_bounds.mean(axis=1)

# Price per square foot, for exploration only.
# Fix: this used to be price / size, i.e. price per *room*, and was then fed to
# the models as a feature. Anything computed from the target leaks it into the
# inputs, so the column is kept on df but excluded from X below.
df['price_per_sqft'] = df['price'] / df['total_sqft']

# Drop unnecessary columns if they exist
columns_to_drop = ['society', 'availability']
existing_columns = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=existing_columns)

# Prepare data for modeling (target and target-derived columns removed)
X = df.drop(columns=['price', 'price_per_sqft'])
y = df['price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define numerical and categorical columns
numerical_cols = ['size', 'bath', 'balcony', 'total_sqft']
categorical_cols = ['area_type', 'location']

# Guard against silent schema drift and against re-introducing the leak.
assert set(numerical_cols + categorical_cols) <= set(X.columns), "feature column missing from X"
assert 'price_per_sqft' not in X.columns, "target-derived column must not be a feature"

Dataset Shape: (13320, 9)

First few rows:
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  

Available columns:
['area_type', 'availability', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony', 'price']

Missing values before preprocessing:
area_type          0
availability      

In [3]:
# Preprocessing for the two feature groups, combined into a single ColumnTransformer.

# Numeric branch: median imputation followed by standard scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding
# (handle_unknown='ignore' so categories absent at fit time do not raise).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [4]:
# Create preprocessing pipelines
# NOTE(review): this cell rebuilds the same three objects as the cell directly
# above it (same steps, same names); one of the two cells is redundant.

# Numbers: median fill, then scale.
median_imputer = SimpleImputer(strategy='median')
feature_scaler = StandardScaler()
numerical_transformer = Pipeline(
    steps=[('imputer', median_imputer), ('scaler', feature_scaler)]
)

# Categories: most-frequent fill, then one-hot encode.
mode_imputer = SimpleImputer(strategy='most_frequent')
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(
    steps=[('imputer', mode_imputer), ('onehot', one_hot_encoder)]
)

# Combine preprocessing steps
column_branches = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)
    

In [5]:
# Hyperparameter search spaces, keyed by model name.
# Every parameter carries the 'model__' prefix, i.e. it targets a pipeline step
# named 'model'.

# Tree ensemble: size, depth and split/leaf thresholds.
random_forest_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

# Gradient boosting: rounds, step size, depth and row subsampling.
xgboost_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 0.9, 1.0],
}

# Support vector regression: regularisation strength, kernel and gamma.
svr_grid = {
    'model__C': [0.1, 1, 10],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto'],
}

# Multi-layer perceptron: layer layout, activation and L2 penalty.
neural_network_grid = {
    'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'model__activation': ['relu', 'tanh'],
    'model__alpha': [0.0001, 0.001, 0.01],
}

param_grids = {
    'Random Forest': random_forest_grid,
    'XGBoost': xgboost_grid,
    'SVR': svr_grid,
    'Neural Network': neural_network_grid,
}
    

In [6]:
 # Define base models
base_models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'SVR': SVR(),
    'Neural Network': MLPRegressor(random_state=42, max_iter=1000)
}

# Train and evaluate models with hyperparameter tuning.
#   results[name]     -> metrics, best parameters and fit time of the tuned model
#   best_models[name] -> best pipeline found by the grid search for that model
results = {}
best_models = {}

# Fix: the loop body used to be spread over three cells, so the loop itself only
# printed the model names and a single grid search ran afterwards on whatever
# `name`/`model` were left over from the last iteration. A for-body cannot span
# cells, so pipeline construction, search, prediction and scoring all live inside
# this one loop and every entry of base_models is tuned.
for name, model in base_models.items():
    print(f"\nTuning {name}...")

    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    # Create GridSearchCV object
    grid_search = GridSearchCV(
        pipeline,
        param_grids[name],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    # Fit the model; the timer brackets the search only (it was previously read
    # without ever being started, and `time` was never imported).
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Store the best model
    best_models[name] = grid_search.best_estimator_

    # Make predictions on the held-out split
    y_pred = grid_search.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'Best Parameters': grid_search.best_params_,
        'Training Time (s)': training_time
    }

    print(f"\n{name} Results:")
    print(f"  Best Parameters: {grid_search.best_params_}")
    print(f"  MSE: {mse:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R2 Score: {r2:.2f}")
    print(f"  Training Time: {training_time:.2f} seconds")
    

In [None]:
# Visualize results
# Fix: the statements below used to be indented under column-0 comments with no
# enclosing block (an IndentationError), and the figure was opened in a different
# cell from the one drawing on it. Figure creation and plotting now share a cell
# so the bar chart is drawn on the 12x6 figure.
print("\nCreating visualizations...")

# One row per model, one column per entry of the per-model result dicts.
metrics_df = pd.DataFrame(results).T

# The result dicts mix floats with the 'Best Parameters' dict, so the transposed
# columns are object dtype; cast R2 explicitly before plotting and formatting.
r2_scores = metrics_df['R2'].astype(float)

# Create a larger figure
plt.figure(figsize=(12, 6))

# Create the bar plot
r2_scores.plot(kind='bar', color='skyblue')

# Customize the plot
plt.title('R2 Scores for Different Models (After Hyperparameter Tuning)', fontsize=14, pad=20)
plt.ylabel('R2 Score', fontsize=12)
plt.xlabel('Models', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add value labels on top of each bar
for i, v in enumerate(r2_scores):
    plt.text(i, v, f'{v:.3f}', ha='center', va='bottom')

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
  # Print detailed results
# Fix: these statements were indented without an enclosing block, which makes the
# cell fail with "unexpected indent"; they now sit at top level.
# Reads `metrics_df` (one row per model) and `results` (per-model dicts holding a
# 'Best Parameters' entry), both built by earlier cells.
print("\nDetailed Results:")
print(metrics_df)

# Print best parameters for each model
print("\nBest Parameters for Each Model:")
for name, result in results.items():
    print(f"\n{name}:")
    print(result['Best Parameters'])