In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle as pkl
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

Reading the Dataset

In [None]:
# Load only the first 20,000 rows. Passing `nrows` makes the parser stop early
# instead of reading the whole CSV and then discarding everything after row 20,000.
df=pd.read_csv('Dataset.csv',nrows=20000)
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


Feature Engineering

In [7]:
# Feature engineering: drop incomplete rows, reduce ADDRESS to its city, remove
# rare cities, binarise BHK_OR_RK and one-hot encode the categorical columns.
# This cell rewrites `df`, so re-run the loading cell before re-running it.

# Cities with fewer than this many rows are removed from the data.
MIN_CITY_COUNT=50

# Drop rows with missing values; `.copy()` gives an independent frame so the
# column assignments below never write into a view of the loaded data.
df=df.dropna().copy()

# Keep only the city, i.e. the LAST comma-separated token of the address
# ("Locality,City"). Indexing with [1] picked the wrong token for addresses
# containing more than one comma and produced NaN for addresses without a comma.
df['ADDRESS']=df['ADDRESS'].str.split(',').str[-1].str.strip()

# `idx` holds the rare cities (fewer than MIN_CITY_COUNT rows) that are removed.
address=df['ADDRESS'].value_counts()
idx=address[address<MIN_CITY_COUNT].index
# `.copy()` avoids SettingWithCopyWarning when BHK_OR_RK is overwritten below.
df=df[~df['ADDRESS'].isin(idx)].copy()

# Binary flag: 1 for 'BHK', 0 for anything else (vectorised, no row-wise lambda).
df['BHK_OR_RK']=(df['BHK_OR_RK']=='BHK').astype('int64')

# One-hot encode the two categorical columns (first level dropped); every other
# column is passed through unchanged.
enc=ColumnTransformer(transformers=[
    ('enc_add',OneHotEncoder(drop='first'),['ADDRESS']),
    ('enc',OneHotEncoder(drop='first'),['POSTED_BY'])
],remainder='passthrough')

# NOTE(review): only the target is dropped here, so the 'Unnamed: 0' row-number
# column shown in the preview is passed through as a feature. Confirm whether that
# is intended before removing it - doing so changes the input schema of the
# pickled encoder, scaler and model.
X=df.drop(columns=['TARGET(PRICE_IN_LACS)'])
y=df['TARGET(PRICE_IN_LACS)']
X=enc.fit_transform(X)


Training the Model

In [8]:

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(X,y,random_state=42,test_size=0.25)

# Learn the scaling from the training rows only, then apply it to both splits.
# with_mean=False scales without centering (presumably because the encoded
# matrix may be sparse - confirm).
scale=StandardScaler(with_mean=False)
scale.fit(xtrain)
xtrain=scale.transform(xtrain)
xtest=scale.transform(xtest)

# Fit the random forest and predict on the held-out rows.
model=RandomForestRegressor(random_state=42).fit(xtrain,ytrain)
pred=model.predict(xtest)


In [9]:
# Score the held-out predictions: R^2 and mean absolute error
# (in the units of the target column, TARGET(PRICE_IN_LACS)).
r2=r2_score(ytest,pred)
mae=mean_absolute_error(ytest,pred)
print(f"r2_score: {r2}")
print(f"Mean Absolute Error: {mae}")

r2_score: 0.9628054893760218
Mean Absolute Error: 31.020698066260774


Pickling the model, scaler, and encoder, and storing the encoded districts

In [None]:
# Persist the fitted model, scaler and encoder so they can be reloaded later.
for path,artifact in (('model.pkl',model),('scale.pkl',scale),('enc.pkl',enc)):
    with open(path,'wb') as f:
        pkl.dump(artifact,f)

# Store the districts the encoder actually knows. `idx` holds the rare districts
# that were REMOVED before encoding, so writing it out listed exactly the values
# the encoder cannot transform. The categories are read from the fitted ADDRESS
# encoder instead; non-string entries (e.g. a NaN category) are skipped.
encoded_district=set(
    dist for dist in enc.named_transformers_['enc_add'].categories_[0]
    if isinstance(dist,str)
)
with open('encoded_district.txt','w') as f:
    # Sorted so the file content is stable between runs (set order is not).
    for dist in sorted(encoded_district):
        f.write(dist+"\n")