In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings# to avoid the warnings
warnings.filterwarnings('ignore')
# Show every column when displaying a DataFrame. `None` is the documented
# "no limit" value; 0 is meant for terminal width auto-detection, and
# `pd.pandas` only resolves because the package happens to import itself.
pd.set_option('display.max_columns', None)
from sklearn.linear_model import LinearRegression,Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [5]:
# Load the post-EDA dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    "After_EDA.csv",
    index_col=0,
)
# Preview the first five rows.
data.head()

Unnamed: 0,location,total_sqft,bath,price,BHK,total_price_per_sqft,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,1st Block Jayanagar,2850.0,4.0,428.0,4.0,15017.54386,0.0,0.0,1.0
1,1st Block Jayanagar,1630.0,3.0,194.0,3.0,11901.840491,0.0,0.0,1.0
2,1st Block Jayanagar,1875.0,2.0,235.0,3.0,12533.333333,0.0,0.0,1.0
3,1st Block Jayanagar,1200.0,2.0,130.0,3.0,10833.333333,0.0,0.0,0.0
4,1st Block Jayanagar,1235.0,2.0,148.0,2.0,11983.805668,0.0,0.0,1.0


In [23]:
# (rows, columns) of the loaded frame.
# NOTE(review): the output recorded for this cell has execution count 23, i.e.
# it was produced after the cell that drops the three area_type_* columns, so
# it shows 6 columns. On a clean top-to-bottom run this cell is expected to
# report 9 columns - re-run the notebook in order to refresh it.
data.shape

(7361, 6)

In [6]:
# List the column labels. Note that the area_type_* labels contain two
# consecutive spaces before "Area"; the rendered table preview hides this.
data.columns

Index(['location', 'total_sqft', 'bath', 'price', 'BHK',
       'total_price_per_sqft', 'area_type_Carpet  Area',
       'area_type_Plot  Area', 'area_type_Super built-up  Area'],
      dtype='object')

In [7]:
# Remove the one-hot `area_type_*` indicator columns; they are not used as
# model inputs in this notebook.
# Selecting by prefix (instead of the exact, double-spaced labels) and
# rebinding `data` (instead of `inplace=True`) keeps this cell safe to re-run:
# a second execution simply finds nothing left to drop rather than raising
# KeyError.
area_type_cols = [col for col in data.columns if col.startswith('area_type_')]
data = data.drop(columns=area_type_cols)

In [8]:
# Sanity check after dropping the area_type_* columns: 6 columns should remain.
data.shape

(7361, 6)

In [9]:
# Confirm which columns remain after the drop.
data.columns

Index(['location', 'total_sqft', 'bath', 'price', 'BHK',
       'total_price_per_sqft'],
      dtype='object')

Models we can try for this regression problem:

1> Linear Regression

2> Lasso Regression

3> Random Forest

4> Ridge Regression


In [10]:
# Model inputs: every column except the target and `total_price_per_sqft`.
# The latter is computed from `price` itself (first row of the preview:
# 428.0 * 1e5 / 2850.0 ~= 15017.54), so keeping it would leak the target.
X = data.drop(columns=['total_price_per_sqft', 'price'])

# Model output: the `price` column.
y = data['price']

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=2,
)

### Applying Linear Regression


In [12]:
# One-hot encode `location` (the only non-numeric input); the remaining
# columns are passed through unchanged.
# handle_unknown='ignore' makes a location that was not present during fit
# encode as all zeros instead of raising at predict time. This matters both
# for the random train/test split and for the pipeline that is pickled at the
# end of the notebook, which reuses this same encoder object.
# NOTE: combining `drop` with handle_unknown='ignore' needs scikit-learn >= 1.0.
column_trans = make_column_transformer(
    (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), ['location']),
    remainder='passthrough',
)

# with_mean=False: scale to unit variance without centring (presumably because
# the encoded matrix is sparse, where centring is not supported).
scaler = StandardScaler(with_mean=False)
lr_model = LinearRegression()

# Encoder -> scaler -> estimator, fitted on the training split only.
pipe = make_pipeline(column_trans, scaler, lr_model)
pipe.fit(X_train, y_train)

In [13]:
# Predict on the held-out split with whatever `pipe` currently holds (the
# linear-regression pipeline when cells run in order). The printed figure is
# the R^2 score, not a classification accuracy.
y_pred_linear = pipe.predict(X_test)
print("By using Linear Regression we got:")
print("Data accuracy", r2_score(y_true=y_test, y_pred=y_pred_linear))

By using Linear Regression we got:
Data accuracy 0.8435141482829952


### Applying Lasso

In [14]:
# Lasso regression with scikit-learn's default settings, reusing the encoder
# and scaler objects created in the linear-regression cell. `pipe` is rebound
# to this new pipeline.
lasso = Lasso()
pipe = make_pipeline(
    column_trans,
    scaler,
    lasso,
)
pipe.fit(X_train, y_train)

In [15]:
# Predict on the held-out split with the current `pipe` (the Lasso pipeline
# when cells run in order) and report its R^2 score.
y_pred_lasso = pipe.predict(X_test)
print("By using Lasso Regression we got:")
print("Data accuracy", r2_score(y_true=y_test, y_pred=y_pred_lasso))

By using Lasso Regression we got:
Data accuracy 0.8397552119405907


### Applying Random Forest

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters. The forest is stochastic
# (bootstrap samples / feature sub-sampling), so a fixed random_state is set to
# make the fitted model and its score repeatable between runs; the value
# mirrors the seed used for the train/test split.
rfm = RandomForestRegressor(random_state=2)

# Same preprocessing objects as the other models; `pipe` is rebound here.
pipe = make_pipeline(column_trans, scaler, rfm)
pipe.fit(X_train, y_train)

In [17]:
# Predict on the held-out split with the current `pipe` (the random-forest
# pipeline when cells run in order) and report its R^2 score.
y_pred_random = pipe.predict(X_test)
print("By using Random Forest we got:")
print("Data accuracy", r2_score(y_true=y_test, y_pred=y_pred_random))


By using Random Forest we got:
Data accuracy 0.8125441003043903


### Applying Ridge

In [18]:
# Ridge regression with scikit-learn's default settings, reusing the shared
# encoder and scaler objects. `pipe` is rebound to this pipeline, and the save
# cell at the end of the notebook pickles whatever `pipe` refers to.
ridge = Ridge()
pipe = make_pipeline(
    column_trans,
    scaler,
    ridge,
)
pipe.fit(X_train, y_train)

In [19]:
# Predict on the held-out split with the current `pipe` (the Ridge pipeline
# when cells run in order) and report its R^2 score.
y_pred_ridge = pipe.predict(X_test)
print("By using Ridge Regression we got:")
print("Data accuracy", r2_score(y_true=y_test, y_pred=y_pred_ridge))

By using Ridge Regression we got:
Data accuracy 0.8435340660731758


Saving the fitted Ridge pipeline (preprocessing + model) for future use...

In [20]:
import pickle

In [21]:
# Serialise whatever `pipe` currently refers to (the Ridge pipeline when the
# cells are run top to bottom). The context manager closes the file handle
# even if pickling fails, so the data is flushed and no descriptor is leaked.
with open('RidgeRegression.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)