In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# Load dataset
df = pd.read_csv("/Users/Tanish/Documents/House Price Prediction/data/Pune_House_Data.csv")

# Drop columns with too many missing values
df.drop(columns=["society"], inplace=True)  # Too many NaNs

# Fill missing values
df["bath"].fillna(df["bath"].median(), inplace=True)
df["balcony"].fillna(df["balcony"].median(), inplace=True)
df.dropna(subset=["size", "site_location"], inplace=True)

# Convert 'size' (e.g., "2 BHK") to integer number of bedrooms
df["BHK"] = df["size"].apply(lambda x: int(str(x).split(" ")[0]))
df.drop(columns=["size"], inplace=True)

# Convert 'total_sqft' to a numeric value
def convert_sqft(value):
    try:
        if '-' in value:  # If range, take average
            sqft_range = list(map(float, value.split('-')))
            return np.mean(sqft_range)
        elif value.isnumeric():
            return float(value)
        else:
            return np.nan  # Handle unexpected cases
    except:
        return np.nan

df["total_sqft"] = df["total_sqft"].apply(convert_sqft)
df.dropna(subset=["total_sqft"], inplace=True)

# Select features and target
features = ["total_sqft", "bath", "balcony", "BHK", "area_type", "site_location"]
target = "price"
X = df[features]
y = df[target]

# Define preprocessing pipeline
num_features = ["total_sqft", "bath", "balcony", "BHK"]
cat_features = ["area_type", "site_location"]

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Define model pipeline
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)

# Save model
with open("house_price_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model training complete. Saved as house_price_model.pkl")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["bath"].fillna(df["bath"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["balcony"].fillna(df["balcony"].median(), inplace=True)


Model training complete. Saved as house_price_model.pkl


In [7]:
import pickle
model = pickle.load(open('/Users/Tanish/Documents/House Price Prediction/models/house_price_model.pkl', 'rb'))

# Create a DataFrame with the same structure as the training data
input_data = pd.DataFrame([[1200, 2, 1, 2, 'Built-up  Area', 'Wakad']], columns=features)

# Make predictions
pred = model.predict(input_data)
print(pred)

[59.66811667]
