In [9]:
import pandas as pd
import numpy as np
import joblib
import json
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load data
df = pd.read_csv("../data/Bengaluru_House_Data.csv")
print("📦 Original rows:", df.shape)

# Keep only relevant columns
df = df[["total_sqft", "bath", "balcony", "size", "location", "price"]]

# Drop rows where crucial columns are missing
df = df.dropna(subset=["total_sqft", "size", "bath", "location", "price"])
print("✅ After dropna on required:", df.shape)

# Extract bhk (works for both "2 BHK" and "4 Bedroom")
def extract_bhk(x):
    try:
        return int(str(x).split(' ')[0])
    except:
        return None

df["bhk"] = df["size"].apply(extract_bhk)
df = df.dropna(subset=["bhk"])

# Convert total_sqft to float (handle "1130-1380" ranges)
def convert_sqft_to_num(x):
    try:
        x = str(x)
        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except:
        return None

df["total_sqft"] = df["total_sqft"].apply(convert_sqft_to_num)
df = df.dropna(subset=["total_sqft"])

# Final cleanup
df = df[["total_sqft", "bath", "balcony", "bhk", "location", "price"]]
df = df.dropna()
df = df[df["price"] < df["price"].quantile(0.95)]  # Remove extreme outliers

# One-hot encode locations
df["location"] = df["location"].str.strip()
df = pd.get_dummies(df, columns=["location"], drop_first=True)

# Prepare input and output
X = df.drop("price", axis=1)
y = df["price"]

print("✅ Final shape after processing:", X.shape, y.shape)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("✅ R² Score:", r2_score(y_test, y_pred))

# Save model and columns
joblib.dump(model, "../backend/model.pkl")

# Save location names (for frontend dropdown)
location_columns = [col.replace("location_", "") for col in df.columns if col.startswith("location_")]
with open("../backend/columns.json", "w") as f:
    json.dump({"location_columns": location_columns}, f)

print("✅ model.pkl and columns.json saved successfully!")


📦 Original rows: (13320, 9)
✅ After dropna on required: (13246, 6)
✅ Final shape after processing: (12024, 1202) (12024,)
✅ R² Score: -6449511683961743.0
✅ model.pkl and columns.json saved successfully!
