In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import matplotlib
# Default figure size (width, height) for plots that do not set their own.
matplotlib.rcParams["figure.figsize"] = (20,10)

In [3]:
# Load the raw housing dataset from a CSV file located in the working directory.
df1 = pd.read_csv('Bengaluru_House_Data.csv')

In [4]:
# Preview the first rows of the raw data.
df1.head()



In [5]:
# Column dtypes and non-null counts of the raw data.
df1.info()



In [6]:
# (rows, columns) of the raw data.
df1.shape



#### Dropping features that are not required

In [7]:
# Keep only the columns used later on; the listed columns are discarded.
df2 = df1.drop(columns=['area_type', 'society', 'balcony', 'availability'])


In [8]:
# Shape after dropping the unused columns (no de-duplication is performed in this cell).
df2.shape



## Data Cleaning: Handling NA values

In [9]:
# Number of missing values in each column.
df2.isnull().sum()




In [10]:
# Drop every row that has a missing value. The explicit .copy() makes df3 an
# independent frame, so the column assignments performed on it afterwards do not
# trigger chained-assignment (SettingWithCopy) warnings.
df3 = df2.dropna().copy()

In [11]:
# Confirm that no missing values remain.
df3.isnull().sum()



In [12]:
# Distinct raw labels in the size column.
df3['size'].unique()



In [13]:
# NOTE(review): this alias rebinds the built-in name `str` to the `string` module for the
# rest of the notebook. The pandas `.str` accessor used below is an attribute lookup and
# does not depend on this import.
import string as str

In [14]:
# Normalise the "Bedroom" label to "BHK". `assign` builds a new frame instead of writing
# into the result of dropna(), and regex=False pins literal matching regardless of the
# pandas version's default for `regex`.
df3 = df3.assign(size=df3['size'].str.replace('Bedroom', 'BHK', regex=False))



In [15]:
# Normalise the "RK" label to "BHK". `assign` builds a new frame instead of writing
# into the result of dropna(), and regex=False pins literal matching regardless of the
# pandas version's default for `regex`.
df3 = df3.assign(size=df3['size'].str.replace('RK', 'BHK', regex=False))




In [16]:
# Distinct size labels after the label normalisation.
df3['size'].unique()



In [17]:
# Derive the room count from the leading integer of the size label (e.g. "3 BHK" -> 3).
# `assign` returns a new frame rather than adding a column to the result of dropna().
df3 = df3.assign(rooms=df3['size'].apply(lambda x: int(x.split(' ')[0])))



In [18]:
# Preview the frame with the derived rooms column.
df3.head()



In [19]:
def is_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Only the errors that signal a failed conversion (TypeError, ValueError,
    OverflowError) are treated as "not a float". Anything else, such as
    KeyboardInterrupt, propagates instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


In [20]:
# First 10 rows whose total_sqft value is rejected by is_float.
df3[~df3['total_sqft'].apply(is_float)].head(10)



#### The rows above show that total_sqft can be a range (e.g. 2100-2850). For such cases we can just take the average of the min and max values in the range. There are other cases, such as 34.46Sq. Meter, which one could convert to square feet using unit conversion. I am going to just drop such corner cases to keep things simple.

In [21]:
def convert_sqft_to_num(x):
    """Convert a total_sqft entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1200"), a range written as "min - max"
        ("2100 - 2850"), or anything else.

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or None when the entry
        cannot be interpreted (e.g. "34.46Sq. Meter", "1000 - 2000Sq. Yards").
    """
    # Values that are already numeric (or otherwise not text) have no range syntax.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    tokens = x.split('-')
    try:
        if len(tokens) == 2:
            # "min - max": use the midpoint of the range.
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Unparseable entry; callers filter these out with notnull().
        return None

In [22]:
# Convert total_sqft to numbers on a copy of df3 and discard rows that could not be parsed.
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
# .copy() makes the filtered frame independent, so the column assignments performed
# on df4 afterwards do not write into a slice of another frame.
df4 = df4[df4['total_sqft'].notnull()].copy()
df4.head(2)



#### Examine location, which is a categorical variable. We need to apply a dimensionality reduction technique here to reduce the number of locations

In [23]:
# Trim surrounding whitespace from the location names, then count rows per location
# (most frequent first).
df4['location'] = df4['location'].str.strip()
location_stats = df4['location'].value_counts()
location_stats



#### Dimensionality Reduction
##### Any location having 10 or fewer data points should be tagged as "other" location. This way the number of categories can be reduced by a huge amount. Later on, when we do one-hot encoding, it will help us by having fewer dummy columns

In [24]:
# Locations that occur at most 10 times.
location_stats_less_than_10 = location_stats.loc[location_stats.le(10)]
location_stats_less_than_10



In [25]:
# Number of distinct locations before rare ones are grouped.
len(df4['location'].unique())



In [26]:
# Replace every location listed in location_stats_less_than_10 (its index holds the
# location names) with the single label 'other', then count the remaining categories.
is_rare_location = df4['location'].isin(location_stats_less_than_10.index)
df4['location'] = df4['location'].mask(is_rare_location, 'other')
df4['location'].nunique()



### Outlier Removal
A 2 BHK apartment is at minimum 600 sqft. If you have, for example, a 400 sqft apartment with 2 BHK, then that seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum threshold per BHK at 300 sqft



In [27]:
# First rows with less than 300 sqft per room.
df4[df4.total_sqft/df4.rooms<300].head()



In [28]:
# Keep every row that is not below 300 sqft per room.
df5 = df4.loc[~df4['total_sqft'].div(df4['rooms']).lt(300)]
df5.shape



# EDA


In [29]:

import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [30]:
# Pairwise correlations between the numeric features and price.
df5[['total_sqft','rooms','bath','price']].corr()



In [31]:
# Keep only the numeric columns of df5.
numeric_features = df5.select_dtypes(include='number')


In [32]:
# Variance inflation factors measure collinearity among the predictors, so the target
# column is left out of the design matrix.
vif_features = numeric_features.drop(columns=['price'], errors='ignore')
# variance_inflation_factor regresses each column on the others exactly as given and
# does not add an intercept itself; a constant column is appended (last) so the
# auxiliary regressions are fitted with an intercept.
vif_design = vif_features.assign(const=1.0)

vif_data = pd.DataFrame()
vif_data["Feature"] = vif_features.columns
# Only the real features are reported; the constant column is skipped.
vif_data["VIF"] = [
    variance_inflation_factor(vif_design.values, i)
    for i in range(len(vif_features.columns))
]

print(vif_data)




In [33]:
# Correlation between rooms and bath.
df5[['rooms', 'bath']].corr()



In [34]:
# Remove the rooms column for the rest of the analysis.
df6 = df5.drop('rooms', axis=1)

In [35]:
# Column dtypes and non-null counts after cleaning.
df6.info()



In [36]:
# Distinct size labels remaining in the cleaned data.
df6['size'].unique()



In [37]:
import seaborn as sns

In [38]:
# Grid of pairwise plots for the columns of df6.
sns.pairplot(df6)





In [39]:
# Number of bathrooms (y) against price (x).
sns.scatterplot(x='price',y='bath',data=df6)





In [40]:
# Price distribution: 50-bin histogram with a KDE overlay.
sns.histplot(df6['price'],bins=50,kde=True)






In [41]:
# total_sqft distribution: 50-bin histogram with a KDE overlay.
sns.histplot(df6['total_sqft'],kde=True,bins=50)






In [42]:
# Side-by-side view of the price and total_sqft columns.
df6[['price','total_sqft']]



In [43]:
# Box plot of price.
df6['price'].plot(kind='box')






In [44]:
# total_sqft (y) against price (x).
sns.scatterplot(x=df6['price'],y=df6['total_sqft'])





In [45]:
# Restrict to the 10 most frequent locations and compare their price distributions.
top_locations = df6['location'].value_counts().index[:10]
df_top = df6.loc[df6['location'].isin(top_locations)]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x=df_top['location'], y=df_top['price'], ax=ax)
plt.xticks(rotation=45)
ax.set_title("Price Distribution for Top 10 Locations")
plt.show()



In [46]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df6, x="total_sqft", y="price", alpha=0.5, ax=ax)

# Overlay the homes larger than 4000 sqft in red.
luxury_homes = df6.loc[df6["total_sqft"].gt(4000)]
sns.scatterplot(
    data=luxury_homes,
    x="total_sqft",
    y="price",
    color="red",
    label="Luxury Homes",
    ax=ax,
)

ax.set_xlabel("Total Square Feet")
ax.set_ylabel("Price")
ax.set_title("Price vs. Total Sqft")
ax.legend()
plt.show()




In [47]:
# Kernel density estimate of the bath column.
df6['bath'].plot(kind='kde',color='red')





# Training the model

In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score,GridSearchCV

In [49]:
# New frame whose price column is replaced by log1p(price); df6 itself is left untouched.
df7 = df6.assign(price=np.log1p(df6['price']))

In [50]:
# Separate the (log-scaled) target from the features, then hold out 20% of the rows.
y = df7['price']
X = df7.drop('price', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``log(1 + x)`` element-wise.

    Any one-dimensional input (a pandas Series, a 1-D array or a flat list) is
    returned as a single-column 2-D array. Input with any other number of
    dimensions is passed to ``np.log1p`` unchanged in shape.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``np.log1p`` of ``X``, reshaped to one column when ``X`` is 1-D."""
        if np.ndim(X) == 1:
            # A single selected column arrives 1-D; return it as an (n_samples, 1) array.
            return np.log1p(np.asarray(X)).reshape(-1, 1)
        return np.log1p(X)

In [52]:
# Column groups used by the preprocessing step: num_features are scaled and
# cat_features are one-hot encoded ('total_sqft' gets its own log transformer).
num_features = ['bath']
cat_features = ['size', 'location']

In [53]:
# Column-wise preprocessing:
#   * num_features are standardised,
#   * 'total_sqft' is log1p-transformed,
#   * cat_features are one-hot encoded; categories unseen during fit are ignored.
# The encoder keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so the encoder is left at its default here
# and sparse_threshold=0 makes the ColumnTransformer always return a dense array.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ("log", LogTransformer(), 'total_sqft'),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
], remainder='passthrough', sparse_threshold=0)

In [54]:
# End-to-end model: column preprocessing followed by a random forest regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=200,
        max_depth=30,
        max_features='sqrt',
        min_samples_split=10,
        min_samples_leaf=1,
        random_state=42,
    )),
])

In [55]:

# Train the model: fit the preprocessing steps and the regressor on the training split.
pipeline.fit(X_train, y_train)




In [56]:
# Predictions for the held-out rows; the target was log1p-transformed, so these are on the log1p(price) scale.
y_pred = pipeline.predict(X_test)


In [57]:
# Mean squared error on the test split, measured on the log1p(price) scale.
mean_squared_error(y_test,y_pred)



In [58]:
# R² on the held-out test split (target is log1p(price)).
r2 = r2_score(y_test, y_pred)
print(f'R² Score: {r2:.5f}')



In [59]:
# R² on the training split, for comparison with the test score.
train_r2 = pipeline.score(X_train, y_train)
print(f"Train R² Score: {train_r2:.5f}")




In [60]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R² computed on the training split only.
cv_r2 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-Validation R² Scores: {cv_r2}")
print(f"Average R² Score: {cv_r2.mean():.5f}")




In [61]:
# Single-row example input with the same feature columns as the training features.
# NOTE(review): this frame is not passed to pipeline.predict in the visible cells. The
# model is trained on log1p(price), so np.expm1 would map a prediction back to price units.
query_data = pd.DataFrame({
    'location': ['Double Road'],  # Example location
    'size': ['3 BHK'],           # Example size (can be categorical, e.g., '2 BHK')
    'total_sqft': [500],        # Example total square feet
    'bath': [3]                  # Example number of bathrooms
})


In [62]:
import pickle

# Save the entire fitted pipeline (preprocessing + model) to a file.
# NOTE: pickle stores classes by reference, so the code that loads this file must be able
# to import or define the custom LogTransformer class used inside the pipeline.
# Only unpickle files from a trusted source.
with open('house_price_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)
