In [None]:
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install matplotlib
%pip install xgboost
%pip install seaborn
%pip install joblib
%pip install shap


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
import pandas as pd

# Show every column when a wide frame is rendered
pd.set_option("display.max_columns", None)

# Read the dataset into the working frame
df = pd.read_csv("House Price India.csv")  # Replace with actual path

# Preview the first 5 rows
df.head()


In [None]:
# Data structure and types (info() prints straight to stdout)
df.info()

# Descriptive stats.
# Jupyter only renders the value of a cell's LAST expression, so a bare
# `df.describe().T` in the middle of the cell is computed and then silently
# discarded. display() (injected into builtins by the IPython kernel) renders
# it explicitly.
display(df.describe().T)

# Missing values per column (last expression, so it is rendered automatically)
df.isnull().sum()


In [None]:
# Discard the columns that are not used further on
# ('Date' can go as long as no time-based analysis is done).
df = df.drop(columns=["id", "Date", "Postal Code"])


In [None]:
# List the distinct values of every object-typed column (if there are any)
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    print(name, df[name].unique())


In [None]:
import numpy as np

# Build all engineered columns in one step, then drop the raw columns they
# were derived from. New columns are appended in the order listed here.
df = df.assign(
    # House age relative to the fixed reference year 2025
    house_age=2025 - df['Built Year'],
    # 1 when a renovation year greater than 0 is recorded, otherwise 0
    was_renovated=np.where(df['Renovation Year'] > 0, 1, 0),
    # Total area (living + basement)
    total_area=df['living area'] + df['Area of the basement'],
    # Price per sqft (just for EDA insight, don't include in X)
    price_per_sqft=df['Price'] / df['living area'],
).drop(columns=['Built Year', 'Renovation Year', 'living area', 'Area of the basement'])

df.head()


In [None]:
# Define function to remove outliers using IQR
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Whisker multiplier applied to the IQR. The default reproduces the
        previously hard-coded value; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` within the fences. Rows where ``column`` is NaN
        are excluded because NaN never satisfies the range check.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    KeyError
        If ``column`` is not a column of ``df``.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)

    # between() is inclusive on both ends by default, matching >= / <=
    return df[values.between(q1 - spread, q3 + spread)]

# Columns filtered for outliers, in processing order
cols_to_check = ['Price', 'total_area', 'number of bedrooms', 'lot area']

# Filter one column after another: each column's fences are computed on the
# rows that survived the filters of the columns before it.
for column_name in cols_to_check:
    df = remove_outliers_iqr(df, column_name)

# Report how many rows are left
print("Remaining rows after outlier removal:", len(df))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1️⃣ Target first, then the feature matrix
#    ('price_per_sqft' is derived from the target, so it is excluded too)
y = df['Price']
X = df.drop(columns=['Price', 'price_per_sqft'])

# 2️⃣ Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3️⃣ Standardise features: statistics are learned from the training split
#    only and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
def evaluate_model(model, name):
    """Fit ``model`` on the scaled training split and score it on the test split.

    Reads the notebook-level variables ``X_train_scaled``, ``y_train``,
    ``X_test_scaled`` and ``y_test``; ``model`` is fitted in place.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    name : str
        Label stored under the ``"Model"`` key of the result.

    Returns
    -------
    dict
        ``"Model"``, ``"R2 Score"`` (rounded to 4 decimals), ``"RMSE"`` and
        ``"MAE"`` (both rounded to 2 decimals), in that key order.
    """
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    summary = {"Model": name}
    summary["R2 Score"] = round(r2_score(y_test, y_pred), 4)
    # RMSE is the square root of the mean squared error
    summary["RMSE"] = round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)
    summary["MAE"] = round(mean_absolute_error(y_test, y_pred), 2)
    return summary


In [None]:
# Candidate regressors paired with the label used in tables and plots
models = [
    (LinearRegression(), "Linear Regression"),
    (Ridge(), "Ridge Regression"),
    (Lasso(), "Lasso Regression"),
    (RandomForestRegressor(n_estimators=100, random_state=42), "Random Forest"),
    (XGBRegressor(n_estimators=100, random_state=42, verbosity=0), "XGBoost")
]

# Fit and score every candidate (each estimator is fitted in place)
results = [evaluate_model(model, name) for model, name in models]

# One row per model, best R² first
results_df = pd.DataFrame(results).sort_values("R2 Score", ascending=False)

# Persist the metrics table
results_df.to_csv("model_metrics.csv", index=False)

# Show table: this has to be the cell's last expression, otherwise Jupyter
# does not render it (previously it sat before to_csv and was never displayed)
results_df


In [None]:
import os

# Make sure the output directory for the saved comparison plots exists.
# The plotting cells further down write their PNG files into static/images
# (per the original note, these images are meant for a web app's metrics page).
# exist_ok=True keeps this cell from failing when the directory already exists.
os.makedirs("static/images", exist_ok=True)

In [None]:
# Horizontal bar chart of each model's R² score
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x='R2 Score', y='Model', data=results_df,
    hue='Model', palette='viridis', legend=False, ax=ax,
)
ax.set_title("Model Comparison – R² Score")
ax.set_xlabel("R² Score (Higher is Better)")
ax.grid(True)

# Write the figure to disk before it is shown
fig.savefig("static/images/r2_score_comparison.png", bbox_inches='tight')

plt.show()

In [None]:
# Two side-by-side bar charts: RMSE on the left, MAE on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (metric column, panel title, colour palette) for each panel, left to right
panels = [
    ('RMSE', "Root Mean Squared Error", 'magma'),
    ('MAE', "Mean Absolute Error", 'coolwarm'),
]
for panel_ax, (metric, panel_title, palette) in zip(axes, panels):
    sns.barplot(x=metric, y='Model', hue='Model', data=results_df,
                ax=panel_ax, palette=palette, legend=False)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True)

fig.tight_layout()

# Write the figure to disk before it is shown
fig.savefig("static/images/error_metrics_comparison.png", bbox_inches='tight')

plt.show()

In [None]:
import os
import joblib
# Create folder if it doesn't exist
os.makedirs("models", exist_ok=True)

# Choose the model to persist from the measured metrics instead of a fixed
# position in `models`: the previous `models[4][0]` always saved XGBoost under
# the name "best_model.pkl", whether or not it actually scored best.
best_name = results_df.loc[results_df["R2 Score"].idxmax(), "Model"]
best_model = next(model for model, name in models if name == best_name)
print(f"Saving best model by R2 score: {best_name}")

# NOTE(review): presumably predictor.py loads these two files; confirm its
# explanation code supports every estimator type that can win here.
joblib.dump(best_model, "models/best_model.pkl")
joblib.dump(scaler, "models/scaler.pkl")


In [None]:
from predictor import predict_with_explain

# Example input for a single house, passed to predict_with_explain as a dict.
# It contains the engineered features created earlier in this notebook
# (house_age, was_renovated, total_area) alongside raw columns.
# NOTE(review): the keys (and possibly their order) presumably have to match
# the columns of X used for training; verify against predictor.py.
sample_input = {
    'number of bedrooms': 4.0,
    'number of bathrooms': 2.5,
    'lot area': 5000.0,
    'number of floors': 1.5,
    'waterfront present': 0,
    'number of views': 2,
    'condition of the house': 4,
    'grade of the house': 8,
    'Area of the house(excluding basement)': 2000.0,
    'Lattitude': 52.9,
    'Longitude': -114.5,
    'living_area_renov': 1800.0,
    'lot_area_renov': 4900.0,
    'Number of schools nearby': 2,
    'Distance from the airport': 45.0,
    'house_age': 25,
    'was_renovated': 1,
    'total_area': 2200.0
}

# Predict and get explanations.
# The call returns three values: the predicted price, an iterable of
# per-feature records (each read below via its 'Feature' and 'Impact' keys),
# and an HTML rendering of the SHAP plot.
predicted_price, explanation, shap_html = predict_with_explain(sample_input)

print(f"Predicted Price: ₹{predicted_price}")
print("\nTop Contributing Factors:")
# One line per record; 'Impact' is formatted with two decimals
for factor in explanation:
    print(f"- {factor['Feature']}: {factor['Impact']:.2f}")

# Note: 'shap_html' is not rendered here; the SHAP plot HTML is best viewed in the browser.