In [None]:
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Read the listings CSV, print the schema summary, and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/content/Mumbai House Prices.csv")
df.info()
df.head(n=5)

In [None]:
# Number of rows that exactly repeat an earlier row. The per-row boolean mask
# is truncated when displayed and does not show whether duplicates exist.
df.duplicated().sum()

In [None]:
# Partition the columns in a single pass: object-dtype columns are treated as
# categorical, every other dtype as numerical. Column order is preserved.
cate_col, num_col = [], []
for name, dtype in df.dtypes.items():
    if dtype == 'object':
        cate_col.append(name)
    else:
        num_col.append(name)

print("Categorical columns :", cate_col)
print("Numerical columns :", num_col)

In [None]:
# Count of distinct values in each categorical column.
df.loc[:, cate_col].nunique()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Reload the CSV so this cell starts from the data as stored on disk.
df = pd.read_csv("/content/Mumbai House Prices.csv")

# Predictors shared by every model below.
X = df.loc[:, ["bhk", "type"]]

# One target Series per quantity to be predicted from X.
y_price, y_status, y_region, y_locality, y_age, y_area = (
    df[column]
    for column in ("price", "status", "region", "locality", "age", "area")
)

# One-hot encode `type` (categories unseen at fit time encode as all zeros);
# the remaining column of X, `bhk`, is passed through unchanged.
preprocessor = ColumnTransformer(
    [('type', OneHotEncoder(handle_unknown='ignore'), ["type"])],
    remainder='passthrough',
)

In [None]:
import re
import numpy as np
import pandas as pd

def age_to_numeric(x):
    """Convert a free-form age value to a float number of years.

    Rules, applied in order:
      * missing value (per ``pd.isna``)          -> NaN
      * "new" / "brand new" (case-insensitive)   -> 0.0
      * range such as "1-5" or "1.5 - 2.5"       -> midpoint of the two bounds
      * open-ended value such as "10+"           -> the stated number
      * any other text containing a number       -> the first number found
      * anything else                            -> NaN

    Numbers may carry a fractional part. Matching whole digits only would
    read "1.5-2.5" as the range "5-2" and truncate 2.5 to 2.0.

    Parameters
    ----------
    x : scalar
        Raw age value; it is converted with ``str`` before parsing.

    Returns
    -------
    float
        Parsed age in years, or NaN when no number can be extracted.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).strip()
    if text.lower() in ("new", "brand new"):   # treat "New" as 0 years
        return 0.0

    # Unsigned integer or decimal, captured as one group.
    number = r'(\d+(?:\.\d+)?)'

    # Range like "1-5" -> midpoint 3.0
    span = re.search(number + r'\s*-\s*' + number, text)
    if span:
        low, high = float(span.group(1)), float(span.group(2))
        return (low + high) / 2.0

    # Open-ended like "10+" -> use the number in front of the plus sign
    open_ended = re.search(number + r'\+', text)
    if open_ended:
        return float(open_ended.group(1))

    # Plain number like "5" or "5 years" -> first number in the text
    first = re.search(number, text)
    if first:
        return float(first.group(1))

    # Nothing numeric found
    return np.nan

# Derive a numeric age column from the raw `age` values.
df['age_num'] = df['age'].map(age_to_numeric)

# Report how many entries could not be converted to a number.
n_unparsed = df['age_num'].isna().sum()
print("Converted age -> numeric; NaNs:", n_unparsed)

# Replace the unparsed entries with the median of the parsed ones.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='median')
imp.fit(df[['age_num']])
df['age_num'] = imp.transform(df[['age_num']])

# The imputed numeric column becomes the regression target for age.
y_age = df['age_num']

# Fit the age regressor on the shared features X (X and preprocessor must
# already be defined by an earlier cell).
pipe_age = Pipeline(steps=[('pre', preprocessor), ('model', LinearRegression())])
pipe_age.fit(X, y_age)

In [None]:
def _fit_pipeline(estimator, target):
    """Fit `estimator` on the shared features `X` to predict `target`.

    Each pipeline gets its own clone of `preprocessor`. Pipeline.fit fits its
    steps in place, so passing the same ColumnTransformer object to every
    pipeline would leave all of them sharing, and refitting, one transformer.

    Parameters
    ----------
    estimator : unfitted scikit-learn estimator used as the final step.
    target : array-like of target values aligned with the rows of `X`.

    Returns
    -------
    Pipeline
        The fitted two-step ('pre', 'model') pipeline.
    """
    pipeline = Pipeline([('pre', clone(preprocessor)), ('model', estimator)])
    return pipeline.fit(X, target)


# Regression targets: price, area, age
pipe_price = _fit_pipeline(LinearRegression(), y_price)
pipe_area = _fit_pipeline(LinearRegression(), y_area)
pipe_age = _fit_pipeline(LinearRegression(), y_age)

# Classification targets: status, region, locality
pipe_status = _fit_pipeline(LogisticRegression(max_iter=500), y_status)
pipe_region = _fit_pipeline(LogisticRegression(max_iter=500), y_region)
pipe_locality = _fit_pipeline(LogisticRegression(max_iter=500), y_locality)

# Keep the fitted locality pipeline as the cell's displayed output.
pipe_locality
