In [1]:
import sys
import matplotlib, sklearn, xgboost, pandas

print("‚úÖ Python path:", sys.executable)
print("üì¶ matplotlib:", matplotlib.__version__)
print("üì¶ sklearn:", sklearn.__version__)
print("üì¶ xgboost:", xgboost.__version__)


‚úÖ Python path: C:\Users\akkap\house-price-prediction\venv\Scripts\python.exe
üì¶ matplotlib: 3.10.7
üì¶ sklearn: 1.7.2
üì¶ xgboost: 3.1.1


In [2]:
# ================================================
# STEP 1: Load combined dataset
# ================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import os
import re

# Load data
data_path = "../data/combined_data.csv"
df = pd.read_csv(data_path)

print("‚úÖ Data loaded successfully!")
print("Shape:", df.shape)
df.head()


‚úÖ Data loaded successfully!
Shape: (46686, 143)


  df = pd.read_csv(data_path)


Unnamed: 0,feature_9,feature_78,feature_61,number_of_views,feature_20,clubhouse,indoorgames,feature_28,feature_29,feature_36,...,feature_54,distance_from_the_airport,feature_33,feature_58,feature_10,feature_72,feature_32,landscapedgardens,feature_41,resale
0,Land Contour,Yr Sold,Garage Finish,,Year Built,,,Exter Qual,Exter Cond,BsmtFin Type 2,...,Kitchen Qual,,Bsmt Exposure,Fireplace Qu,Utilities,Pool Area,Bsmt Cond,,Heating QC,
1,Lvl,2010,Fin,,1960,,,TA,TA,Unf,...,TA,,Gd,Gd,AllPub,0,Gd,,Fa,
2,Lvl,2010,Unf,,1961,,,TA,TA,LwQ,...,TA,,No,,AllPub,0,TA,,TA,
3,Lvl,2010,Unf,,1958,,,TA,TA,Unf,...,Gd,,No,,AllPub,0,TA,,TA,
4,Lvl,2010,Fin,,1968,,,Gd,TA,Unf,...,Ex,,No,TA,AllPub,0,TA,,Ex,


In [3]:
# ================================================
# STEP 2: Inspect and clean data
# ================================================

print(f"Original shape: {df.shape}")

# Drop columns with too many missing values (>40%)
threshold = 0.4 * len(df)
df = df.dropna(axis=1, thresh=threshold)
print(f"After dropping high-missing columns: {df.shape}")

# Drop duplicates
df = df.drop_duplicates()

# Confirm target column exists
if 'sale_price' not in df.columns:
    raise ValueError("‚ùå Target column 'sale_price' not found!")

# üßπ Clean and convert sale_price
def clean_price(value):
    if pd.isna(value):
        return np.nan
    s = str(value).lower().strip()
    s = re.sub(r"[‚Çπ$,]", "", s)
    s = s.replace("rs", "").replace("inr", "").replace("usd", "")
    s = s.replace("price", "").replace(" ", "")
    if "lac" in s or "l" in s:
        num = re.findall(r"\d+\.?\d*", s)
        return float(num[0]) * 1e5 if num else np.nan
    if "cr" in s or "crore" in s:
        num = re.findall(r"\d+\.?\d*", s)
        return float(num[0]) * 1e7 if num else np.nan
    try:
        return float(s)
    except:
        return np.nan

# Remove header-like text rows (e.g. "SalePrice")
df = df[~df['sale_price'].astype(str).str.contains("saleprice", case=False, na=False)]

df["sale_price"] = df["sale_price"].apply(clean_price)

# Filter valid numeric prices
df = df.dropna(subset=["sale_price"])
df = df[df["sale_price"] > 1000]

print("‚úÖ Cleaned sale_price successfully.")
print(df["sale_price"].describe())


Shape: (46686, 143)
Columns: ['feature_9', 'feature_78', 'feature_61', 'number_of_views', 'feature_20', 'clubhouse', 'indoorgames', 'feature_28', 'feature_29', 'feature_36', 'feature_60', 'carparking', 'rainwaterharvesting', 'school', '24x7security', 'feature_43', 'feature_38', 'feature_66', 'feature_56', 'feature_73']


  df = pd.read_csv("../data/combined_data.csv")


Unnamed: 0,feature_9,feature_78,feature_61,number_of_views,feature_20,clubhouse,indoorgames,feature_28,feature_29,feature_36,...,feature_54,distance_from_the_airport,feature_33,feature_58,feature_10,feature_72,feature_32,landscapedgardens,feature_41,resale
0,Land Contour,Yr Sold,Garage Finish,,Year Built,,,Exter Qual,Exter Cond,BsmtFin Type 2,...,Kitchen Qual,,Bsmt Exposure,Fireplace Qu,Utilities,Pool Area,Bsmt Cond,,Heating QC,
1,Lvl,2010,Fin,,1960,,,TA,TA,Unf,...,TA,,Gd,Gd,AllPub,0,Gd,,Fa,
2,Lvl,2010,Unf,,1961,,,TA,TA,LwQ,...,TA,,No,,AllPub,0,TA,,TA,
3,Lvl,2010,Unf,,1958,,,TA,TA,Unf,...,Gd,,No,,AllPub,0,TA,,TA,
4,Lvl,2010,Fin,,1968,,,Gd,TA,Unf,...,Ex,,No,TA,AllPub,0,TA,,Ex,


In [4]:
# ================================================
# STEP 3: Split features (X) and target (y)
# ================================================
if 'sale_price' not in df.columns:
    raise ValueError("‚ùå 'sale_price' column missing!")

X = df.drop(columns=['sale_price'], errors='ignore')
y = df['sale_price'].astype(float)

print("‚úÖ Feature matrix shape:", X.shape)
print("‚úÖ Target vector shape:", y.shape)


üßπ Missing values summary:
feature_73    46672
feature_75    46579
feature_7     46487
feature_74    46113
feature_26    45530
feature_58    45177
feature_4     44245
feature_64    43914
feature_61    43914
feature_60    43914
feature_65    43914
feature_59    43912
feature_33    43838
feature_36    43836
feature_34    43835
feature_32    43835
feature_31    43835
feature_27    43778
feature_49    43757
feature_48    43757
dtype: int64

Remaining columns after dropping high-missing ones: 40
After dropping duplicates: (33069, 40)
‚úÖ After cleaning invalid prices: (33068, 40)

üìä sale_price column info:
count    3.306800e+04
mean     1.058902e+07
std      2.271744e+07
min      1.278900e+04
25%      3.300000e+06
50%      6.000000e+06
75%      1.100000e+07
max      8.546000e+08
Name: sale_price, dtype: float64


In [5]:
# ================================================
# STEP 4: Train/Test Split
# ================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("‚úÖ Train:", X_train.shape, "Test:", X_test.shape)
print("‚úÖ y_train dtype:", y_train.dtype)


Numeric columns: 38
Categorical columns: 1


In [6]:
# ================================================
# STEP 5: Preprocessing Setup
# ================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric features:", len(numeric_features))
print("Categorical features:", len(categorical_features))

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


["feature_9,feature_78,feature_61,number_of_views,feature_20,clubhouse,indoorgames,feature_28,feature_29,feature_36,feature_60,carparking,rainwaterharvesting,school,24x7security,feature_43,feature_38,feature_66,feature_56,feature_73,gasconnection,feature_49,wifi,built_year,number_of_floors,powerbackup,lattitude,atm,feature_51,feature_35,feature_52,sale_price,feature_76,feature_18,feature_26,feature_64,no._of_bedrooms,longitude,feature_0,feature_8,washingmachine,feature_11,renovation_year,feature_53,children'splayarea,sportsfacility,feature_25,area_of_the_basement,feature_40,feature_1,shoppingmall,location,condition_of_the_house,feature_62,feature_80,hospital,feature_63,feature_59,swimmingpool,grade_of_the_house,tv,living_area,gymnasium,ac,liftavailable,feature_71,maintenancestaff,feature_2,feature_74,cafeteria,feature_65,number_of_bedrooms,feature_79,lot_area,id,postal_code,multipurposeroom,feature_30,staffquarter,living_area_renov,lot_area_renov,microwave,feature_7,feature_6,feature_1

In [7]:
# ================================================
# STEP 6: Model + Pipeline
# ================================================
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])


Shape: (46686, 1)
Columns: ["feature_9,feature_78,feature_61,number_of_views,feature_20,clubhouse,indoorgames,feature_28,feature_29,feature_36,feature_60,carparking,rainwaterharvesting,school,24x7security,feature_43,feature_38,feature_66,feature_56,feature_73,gasconnection,feature_49,wifi,built_year,number_of_floors,powerbackup,lattitude,atm,feature_51,feature_35,feature_52,sale_price,feature_76,feature_18,feature_26,feature_64,no._of_bedrooms,longitude,feature_0,feature_8,washingmachine,feature_11,renovation_year,feature_53,children'splayarea,sportsfacility,feature_25,area_of_the_basement,feature_40,feature_1,shoppingmall,location,condition_of_the_house,feature_62,feature_80,hospital,feature_63,feature_59,swimmingpool,grade_of_the_house,tv,living_area,gymnasium,ac,liftavailable,feature_71,maintenancestaff,feature_2,feature_74,cafeteria,feature_65,number_of_bedrooms,feature_79,lot_area,id,postal_code,multipurposeroom,feature_30,staffquarter,living_area_renov,lot_area_renov,microwave,fe

In [8]:
# ================================================
# STEP 7: Train Model
# ================================================
print("üöÄ Training model...")
pipeline.fit(X_train, y_train)
print("‚úÖ Training complete!")


Shape: (46686, 143)
Columns: ['feature_9', 'feature_78', 'feature_61', 'number_of_views', 'feature_20', 'clubhouse', 'indoorgames', 'feature_28', 'feature_29', 'feature_36', 'feature_60', 'carparking', 'rainwaterharvesting', 'school', '24x7security', 'feature_43', 'feature_38', 'feature_66', 'feature_56', 'feature_73', 'gasconnection', 'feature_49', 'wifi', 'built_year', 'number_of_floors', 'powerbackup', 'lattitude', 'atm', 'feature_51', 'feature_35', 'feature_52', 'sale_price', 'feature_76', 'feature_18', 'feature_26', 'feature_64', 'no._of_bedrooms', 'longitude', 'feature_0', 'feature_8', 'washingmachine', 'feature_11', 'renovation_year', 'feature_53', "children'splayarea", 'sportsfacility', 'feature_25', 'area_of_the_basement', 'feature_40', 'feature_1', 'shoppingmall', 'location', 'condition_of_the_house', 'feature_62', 'feature_80', 'hospital', 'feature_63', 'feature_59', 'swimmingpool', 'grade_of_the_house', 'tv', 'living_area', 'gymnasium', 'ac', 'liftavailable', 'feature_71', 

  df = pd.read_csv("../data/combined_data.csv")


Unnamed: 0,feature_9,feature_78,feature_61,number_of_views,feature_20,clubhouse,indoorgames,feature_28,feature_29,feature_36,...,feature_54,distance_from_the_airport,feature_33,feature_58,feature_10,feature_72,feature_32,landscapedgardens,feature_41,resale
0,Land Contour,Yr Sold,Garage Finish,,Year Built,,,Exter Qual,Exter Cond,BsmtFin Type 2,...,Kitchen Qual,,Bsmt Exposure,Fireplace Qu,Utilities,Pool Area,Bsmt Cond,,Heating QC,
1,Lvl,2010,Fin,,1960,,,TA,TA,Unf,...,TA,,Gd,Gd,AllPub,0,Gd,,Fa,
2,Lvl,2010,Unf,,1961,,,TA,TA,LwQ,...,TA,,No,,AllPub,0,TA,,TA,
3,Lvl,2010,Unf,,1958,,,TA,TA,Unf,...,Gd,,No,,AllPub,0,TA,,TA,
4,Lvl,2010,Fin,,1968,,,Gd,TA,Unf,...,Ex,,No,TA,AllPub,0,TA,,Ex,


In [9]:
# ================================================
# STEP 8: Evaluate Model
# ================================================
y_pred = pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"üìä RMSE: {rmse:.2f}")
print(f"üìà R¬≤: {r2:.3f}")

plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted House Prices")
plt.show()


In [10]:
# ================================================
# STEP 9: Save and Test Model
# ================================================
model_path = "../models/pipeline.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

print(f"üíæ Model saved successfully at: {model_path}")

# Test loading and prediction
loaded_model = joblib.load(model_path)
sample = X_test.iloc[0:1]
pred_price = loaded_model.predict(sample)[0]
print("üß† Sample Prediction:", round(pred_price, 2))


Dataset shape: (46686, 143)
Columns list:
['feature_9', 'feature_78', 'feature_61', 'number_of_views', 'feature_20', 'clubhouse', 'indoorgames', 'feature_28', 'feature_29', 'feature_36', 'feature_60', 'carparking', 'rainwaterharvesting', 'school', '24x7security', 'feature_43', 'feature_38', 'feature_66', 'feature_56', 'feature_73', 'gasconnection', 'feature_49', 'wifi', 'built_year', 'number_of_floors', 'powerbackup', 'lattitude', 'atm', 'feature_51', 'feature_35', 'feature_52', 'sale_price', 'feature_76', 'feature_18', 'feature_26', 'feature_64', 'no._of_bedrooms', 'longitude', 'feature_0', 'feature_8', 'washingmachine', 'feature_11', 'renovation_year', 'feature_53', "children'splayarea", 'sportsfacility', 'feature_25', 'area_of_the_basement', 'feature_40', 'feature_1']


In [11]:
# ================================================
# STEP 5: Split data for training and validation
# ================================================

X = df.drop(columns=["sale_price"])
y = df["sale_price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (37348, 142), Test set: (9338, 142)


In [12]:
print("Numeric columns:", len(numeric_features), numeric_features[:10])
print("Categorical columns:", len(categorical_features), categorical_features[:10])


Numeric columns: 60 ['number_of_views', 'clubhouse', 'indoorgames', 'carparking', 'rainwaterharvesting', 'school', '24x7security', 'gasconnection', 'wifi', 'built_year']
Categorical columns: 83 ['feature_9', 'feature_78', 'feature_61', 'feature_20', 'feature_28', 'feature_29', 'feature_36', 'feature_60', 'feature_43', 'feature_38']


In [13]:
# ================================================
# STEP 5: Split data into features and target
# ================================================

# Confirm sale_price exists
if 'sale_price' not in df.columns:
    raise ValueError("‚ùå 'sale_price' column missing!")

# Separate features (X) and target (y)
X = df.drop(columns=['sale_price'])
y = df['sale_price']

print("‚úÖ Feature matrix shape:", X.shape)
print("‚úÖ Target vector shape:", y.shape)



‚úÖ Feature matrix shape: (46686, 142)
‚úÖ Target vector shape: (46686,)


In [14]:
# ================================================
# STEP 4: Split features and target
# ================================================
if 'sale_price' not in df.columns:
    raise ValueError("‚ùå Target column 'sale_price' not found in dataset!")

X = df.drop(columns=['sale_price'], errors='ignore')
y = df['sale_price']

print("‚úÖ Feature matrix:", X.shape)
print("‚úÖ Target vector:", y.shape)

‚úÖ Feature matrix: (46686, 142)
‚úÖ Target vector: (46686,)


In [15]:
# ================================================
# STEP 5: Split Train/Test
# ================================================
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (37348, 142) Test: (9338, 142)


In [16]:
# ================================================
# STEP 6: Preprocessing (Based ONLY on X_train)
# ================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric features:", len(numeric_features))
print("Categorical features:", len(categorical_features))

# Pipelines for preprocessing
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

Numeric features: 60
Categorical features: 82


In [19]:
# ================================================
# STEP 7: Model + Pipeline
# ================================================
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

In [20]:
# ================================================
# STEP: Clean sale_price column
# ================================================
if 'sale_price' not in df.columns:
    raise ValueError("‚ùå 'sale_price' column not found in dataset!")

print("Before cleaning:", df['sale_price'].head(10))

# Remove currency symbols, commas, spaces, and text
df['sale_price'] = (
    df['sale_price']
    .astype(str)
    .str.replace(r'[^0-9.]', '', regex=True)  # keep only digits and decimal points
    .replace('', np.nan)
)

# Convert to numeric (coerce errors to NaN)
df['sale_price'] = pd.to_numeric(df['sale_price'], errors='coerce')

# Drop rows where price couldn't be converted
df = df.dropna(subset=['sale_price'])

# Filter out unrealistic values
df = df[df['sale_price'] > 1000]

print("‚úÖ Cleaned and converted sale_price.")
print(df['sale_price'].describe())


Before cleaning: 0    SalePrice
1       215000
2       105000
3       172000
4       244000
5       189900
6       195500
7       213500
8       191500
9       236500
Name: sale_price, dtype: object
‚úÖ Cleaned and converted sale_price.
count    4.668500e+04
mean     7.639496e+06
std      1.966485e+07
min      1.278900e+04
25%      5.157000e+05
50%      3.769000e+06
75%      8.358000e+06
max      8.546000e+08
Name: sale_price, dtype: float64


In [21]:
# ================================================
# STEP 8: Fit
# ================================================
print("üöÄ Training model...")
pipeline.fit(X_train, y_train)
print("‚úÖ Training complete!")

üöÄ Training model...


ValueError: could not convert string to float: 'SalePrice'

In [None]:
# ================================================
# STEP 7: Evaluate performance
# ================================================
y_pred = pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R¬≤: {r2:.3f}")

# Optional visualization
plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted House Prices")
plt.show()


In [None]:
# ================================================
# STEP 8: Save model
# ================================================

model_path = "../models/pipeline.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

print(f"üíæ Model saved successfully at: {model_path}")


In [None]:
# ================================================
# STEP 9: Test saved model
# ================================================
loaded_model = joblib.load(model_path)
sample = X_test.iloc[0:1]
pred_price = loaded_model.predict(sample)[0]
print("üß† Sample Prediction:", round(pred_price, 2))
