In [None]:
import numpy as np
import pandas as pd
import pickle
import sklearn.datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Read the raw listings from a CSV file located in the current working directory.
house_dataset=pd.read_csv("Bengaluru_House_Data.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
house_dataset.head()

In [None]:
# Remove the three columns that are not used as model features.
unused_columns = ['area_type', 'availability', 'society']
house_dataset = house_dataset.drop(columns=unused_columns)

In [None]:
# Preview the frame again now that the unused columns are gone.
house_dataset.head()

In [None]:
# (rows, columns) of the frame at this stage.
house_dataset.shape

In [None]:
# Count the missing values in each column.
house_dataset.isna().sum()

In [None]:
# Discard every row that has at least one missing value, then confirm none remain.
house_dataset = house_dataset.dropna(how='any')
house_dataset.isna().sum()

In [None]:
# Show descriptive summary statistics for the frame.
house_dataset.describe()

In [None]:
# 1. Clean 'size' column: keep the first run of digits found in each value, as a float.
# The pattern is a raw string because '\d' inside a normal literal is an invalid
# escape sequence (a warning on current Python versions).
# expand=False makes str.extract return a Series, so a single column is assigned.
house_dataset['size'] = (
    house_dataset['size']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

# 2. Clean 'total_sqft' column
def convert_sqft_to_num(x):
    """Convert one 'total_sqft' entry to a float.

    Parameters
    ----------
    x : str or number
        A plain number ("1200"), a range written as "low - high" (converted
        to the midpoint of the two bounds), or an already numeric value.

    Returns
    -------
    float or None
        The parsed value, or None when the entry cannot be interpreted as a
        number or a two-sided numeric range.
    """
    # Already-numeric values pass straight through, so re-running the cleaning
    # cell on a converted column does not turn every value into None.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    tokens = x.split('-')
    try:
        if len(tokens) == 2:
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Only parse failures are swallowed; anything else should surface.
        return None

# Convert every 'total_sqft' entry to a float; entries that cannot be parsed become missing.
house_dataset['total_sqft'] = house_dataset['total_sqft'].apply(convert_sqft_to_num)

# 3. Drop rows with remaining nulls (after conversion).
# Rebind the name instead of mutating with inplace=True, so the step reads as a
# plain transformation and does not alter a frame other names may still refer to.
house_dataset = house_dataset.dropna()

# After the dropna above no nulls are left, so this fill does nothing;
# it is kept only as a safeguard in case the dropna step is changed later.
house_dataset['location'] = house_dataset['location'].fillna('Unknown')

# Convert to numeric labels
label_encoder = LabelEncoder()
house_dataset['location'] = label_encoder.fit_transform(house_dataset['location'])


In [None]:
# Persist the fitted location encoder so the same location-to-integer mapping
# can be reloaded later.
with open('location_label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [None]:
# The target is the 'price' column; every remaining column is a feature.
Y = house_dataset['price']
X = house_dataset.drop(columns=['price'])

In [None]:
# Print the feature frame, then the target series, each starting on its own line.
print(X, Y, sep="\n")

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=45,
)

In [None]:
# Shapes of the full, training and test feature frames, in that order.
print(*(frame.shape for frame in (X, X_train, X_test)))

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the estimator so the fitted tree, and every score computed from it,
# is the same on each Restart & Run All.
model = DecisionTreeRegressor(random_state=45)

In [None]:
# Fit the regressor on the training split.
model.fit(X_train, Y_train)

In [None]:
# Predict on the rows the model was trained on.
train_predict=model.predict(X_train)


In [None]:
# Show the predictions made for the training rows.
print(train_predict)

In [None]:
# R squared (coefficient of determination) on the training split.
# This is a goodness-of-fit score, not an error, so it is labelled as a score.
score1 = metrics.r2_score(Y_train, train_predict)
# Mean absolute error on the training split, in the same units as the target.
score2 = metrics.mean_absolute_error(Y_train, train_predict)
print("R squared score:", score1)
print("Mean Absolute error:", score2)

In [None]:
# Predict on the held-out test rows.
test_predict=model.predict(X_test)


In [None]:
# R squared (coefficient of determination) on the held-out test split.
# This is a goodness-of-fit score, not an error, so it is labelled as a score.
score1 = metrics.r2_score(Y_test, test_predict)
# Mean absolute error on the test split, in the same units as the target.
score2 = metrics.mean_absolute_error(Y_test, test_predict)
print("R squared score:", score1)
print("Mean Absolute error:", score2)

In [None]:
# Load the encoder
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# pickle files that come from a trusted source.
with open('location_label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)



In [None]:
# Sample input for prediction. The location name is mapped to its integer label
# with the fitted encoder; transform() raises ValueError for a location the
# encoder has not seen.
sample_data = {
    'location': label_encoder.transform(['Bisuvanahalli'])[0],
    'size': 3,
    'total_sqft': 1180,
    'bath': 3,
    'balcony': 2
}

# Build a one-row frame whose columns are taken from X, so feature names and
# order match the training data regardless of the key order in the dict above.
# A key missing from sample_data shows up as NaN in that column.
sample_df = pd.DataFrame([sample_data], columns=X.columns)

# Predict the price
# NOTE(review): confirm the unit of 'price' in the source CSV before presenting
# the value as plain rupees.
predicted_price = model.predict(sample_df)
print(f"Predicted house price: ₹{predicted_price[0]:,.2f}")


In [None]:
# Save the fitted model. The context manager closes (and therefore flushes)
# the file before it is reopened for reading below.
with open('mlmodel.pkl', 'wb') as f:
    pickle.dump(model, f)

# Reload the saved model as a round-trip check.
# NOTE: only unpickle files from a trusted source; pickle.load can run arbitrary code.
with open('mlmodel.pkl', 'rb') as f:
    mlmode = pickle.load(f)