In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import VotingRegressor

# Read the CSV at `file_path` into the working DataFrame `df`
file_path = '/content/_All_Cities_Cleaned.csv'  # Update this with the correct path if needed
df = pd.read_csv(file_path)

# Print a short summary (dimensions and column index), then show the first rows
print("Dataset loaded successfully!")
for label, value in (("Shape:", df.shape), ("Columns:", df.columns)):
    print(label, value)
df.head()


In [None]:
# Handle missing values.
# Rows without a positive price or area are unusable: price is the prediction
# target (filling it with 0 would invent a log1p(0) target), and a zero/missing
# area turns price / area into inf or NaN, which the scaler and the regressors
# reject. Such rows are dropped instead of being filled with 0.
n_rows_before = len(df)
df = df.dropna(subset=['price', 'area'])
df = df[(df['price'] > 0) & (df['area'] > 0)].copy()
n_rows_dropped = n_rows_before - len(df)
if n_rows_dropped:
    print(f"Dropped {n_rows_dropped} rows with missing or non-positive price/area")

# Categorical gaps get a string placeholder: filling them with the integer 0
# would mix str and int in one column, which OneHotEncoder cannot encode.
categorical_cols = ['layout_type', 'property_type', 'furnish_type', 'city']
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Any remaining (numeric) gaps are filled with 0; a new frame is assigned
# rather than mutating in place so the cell can be re-run safely.
df = df.fillna(0)

# Feature Engineering
# NOTE(review): price_per_sqft is computed from price, i.e. from the target.
# A model that receives it together with area can reconstruct the target, so it
# should not be used as a model input. It stays in X only because the feature
# lists in later cells reference the column by name.
df['price_per_sqft'] = df['price'] / df['area']  # Price per square foot
df['log_area'] = np.log1p(df['area'])           # Log-transform area
df['log_price'] = np.log1p(df['price'])         # Log-transform price

# Features and target
X = df[['area', 'bedroom', 'bathroom', 'price_per_sqft', 'log_area', 'layout_type', 'property_type', 'furnish_type', 'city']]
y = df['log_price']  # Using log-transformed price as the target

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing sets successfully!")


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor

# Define numerical and categorical features.
# 'price_per_sqft' is deliberately NOT listed: it is computed as price / area,
# and the target is log1p(price), so together with 'area' it would hand the
# target to the model (target leakage). Columns of X that are not listed here
# are discarded by the ColumnTransformer below (remainder='drop').
num_features = ['area', 'bedroom', 'bathroom', 'log_area']
cat_features = ['layout_type', 'property_type', 'furnish_type', 'city']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        # Scale numerical features
        ('num', StandardScaler(), num_features),
        # One-hot encode categorical features. handle_unknown='ignore' encodes a
        # category that was not seen during fit as all zeros instead of raising,
        # so a level that only occurs in the test split cannot crash predict().
        # No reference level is dropped: that keeps the all-zero row unambiguous.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    remainder='drop',
)

# Verify preprocessing works on data
X_train_processed = preprocessor.fit_transform(X_train)
print("Preprocessed training data shape:", X_train_processed.shape)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate random-forest settings (3 values for each of 4 parameters)
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, 3-fold CV, scored by negative MSE
grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)

# Preprocess, then search; fitting the pipeline fits `grid_search_rf` itself,
# which is why its results are read from that object below
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', grid_search_rf),
])
rf_model.fit(X_train, y_train)

# Report the winning parameter combination
best_rf_params = grid_search_rf.best_params_
print("Best Random Forest Parameters:", best_rf_params)


In [None]:
# Candidate XGBoost settings to search over
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive search over the grid, 3-fold CV, scored by negative MSE
grid_search_xgb = GridSearchCV(
    xgb.XGBRegressor(random_state=42),
    param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)

# Preprocess, then search; fitting the pipeline fits `grid_search_xgb` itself,
# which is why its results are read from that object below
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', grid_search_xgb),
])
xgb_model.fit(X_train, y_train)

# Report the winning parameter combination
best_xgb_params = grid_search_xgb.best_params_
print("Best XGBoost Parameters:", best_xgb_params)


In [None]:
# Combine the best estimator found by each grid search in a voting regressor
voting_regressor = VotingRegressor(
    estimators=[
        ('random_forest', grid_search_rf.best_estimator_),
        ('xgboost', grid_search_xgb.best_estimator_),
    ]
)

# Put the shared preprocessing step in front of the ensemble
ensemble_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', voting_regressor),
])

# Fit preprocessing and ensemble on the training split
ensemble_pipeline.fit(X_train, y_train)

print("Ensemble model trained successfully!")

In [None]:
# The ensemble pipeline is already fitted by the cell that defines it, so it is
# not refitted here: a second fit on the same training data only repeats the
# training cost.

# Define the evaluation function
def evaluate_model(model, X_test, y_test, log_target=True):
    """Print RMSE and R² for a fitted model on held-out data.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X_test : features passed to ``model.predict``.
    y_test : true target values, on the same scale the model was trained on.
    log_target : bool, default True
        When True the target is treated as ``log1p(price)`` (as built in this
        notebook) and both predictions and true values are mapped back with
        ``expm1`` before RMSE is computed, so the printed RMSE is in ₹.
        When False the values are used as-is and are assumed to be prices.

    Notes
    -----
    R² is always computed on the scale the model predicts.
    """
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    if log_target:
        # Undo log1p so the error is expressed in currency units, not log units
        actual, predicted = np.expm1(y_test), np.expm1(y_pred)
    else:
        actual, predicted = y_test, y_pred

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    print("Model Evaluation Metrics:")
    print(f"Root Mean Squared Error (RMSE): ₹{rmse:.2f}")
    print(f"R² Score: {r2:.4f}")

# Evaluate the trained pipeline
evaluate_model(ensemble_pipeline, X_test, y_test)
