In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor

In [2]:
# Load the cleaned dataset from the working directory and preview the first three rows.
data = pd.read_csv(filepath_or_buffer="Cleaned_data.csv")
data.iloc[:3]

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3


In [3]:
# Everything except 'price' is a predictor; 'price' is the regression target.
x = data.drop(columns=['price'])
y = data['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [4]:
# (rows, columns) of the training and test predictor frames.
tuple(split.shape for split in (x_train, x_test))

((5888, 4), (1473, 4))

In [5]:
# One-hot encode the categorical 'location' column as a dense array and pass every
# other column through unchanged.
# handle_unknown="ignore" makes a location that was not present during fit encode as
# an all-zero row instead of raising, so scoring or predicting on data containing an
# unseen location does not fail.
col_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ['location']),
    remainder="passthrough",
)

In [6]:
# Standardizer shared by the "scaled" pipelines below: centre each column and scale it
# to unit variance (the defaults, spelled out here).
scaler = StandardScaler(with_mean=True, with_std=True)

## Applying Linear Regression

In [7]:
# Linear regression on the encoded features, with standardization applied after encoding.
pipe = make_pipeline(col_trans, scaler, LinearRegression()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(86.17007374637254, 82.5197701638146)

In [8]:
# Same linear regression, this time without the standardization step.
pipe = make_pipeline(col_trans, LinearRegression()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(86.17007374637254, 82.51977016381197)

## Applying Lasso

In [9]:
# Lasso with its default settings on encoded + standardized features.
pipe = make_pipeline(col_trans, scaler, Lasso()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(84.93537906204466, 81.4689475169039)

In [10]:
# Lasso with its default settings, skipping the standardization step.
pipe = make_pipeline(col_trans, Lasso()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(73.71606368647622, 67.1688263276343)

## Applying Ridge

In [11]:
# Ridge with its default settings on encoded + standardized features.
pipe = make_pipeline(col_trans, scaler, Ridge()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(86.17006997170607, 82.5234850229012)

In [12]:
# Ridge with its default settings, skipping the standardization step.
pipe = make_pipeline(col_trans, Ridge()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(86.03449670805759, 82.23431153904201)

## Applying KNN

In [13]:
# K-nearest-neighbours regression (default neighbour count) on encoded + standardized features.
# Author's note kept from the original cell: "7 (80.57297840111704, 80.6027140944164)"
# — presumably the (train, test) scores observed with n_neighbors=7.
pipe = make_pipeline(col_trans, scaler, KNeighborsRegressor()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(84.49544489464031, 80.0193175136129)

In [14]:
# K-nearest-neighbours regression (default neighbour count) without standardization.
# Author's note kept from the original cell: "37 (67.86313123305679, 67.94446726577492)"
# — presumably the (train, test) scores observed with n_neighbors=37; TODO confirm.
pipe = make_pipeline(col_trans, KNeighborsRegressor()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(79.64437152798445, 59.399456341076174)

In [15]:
# Sweep the neighbour count for the scaled KNN pipeline and print, per value:
# k, train R^2 %, test R^2 %, and the train-minus-test gap.
for i in range(2,10):
    pipe = make_pipeline(col_trans,scaler,KNeighborsRegressor(n_neighbors=i))
    pipe.fit(x_train,y_train)
    # Score each split exactly once: every score() call runs a full prediction pass,
    # so reusing the values halves the work while printing the same numbers.
    train_pct = pipe.score(x_train,y_train)*100
    test_pct = pipe.score(x_test,y_test)*100
    print(i,train_pct,test_pct,train_pct-test_pct)

2 92.94803608384234 81.7776341699662 11.170401913876134
3 89.11885917840951 81.94179882649817 7.1770603519113365
4 85.82084862877973 81.3951701436477 4.425678485132025
5 84.49544489464031 80.0193175136129 4.4761273810274105
6 82.39700901107119 79.3158219608936 3.08118705017759
7 80.57297840111704 80.6027140944164 -0.02973569329935799
8 79.18640301484518 79.88222318617231 -0.6958201713271279
9 78.5825816142597 79.66290193367317 -1.0803203194134738


## Applying SVR

In [16]:
# Support vector regression with default settings on encoded + standardized features.
pipe = make_pipeline(col_trans, scaler, SVR()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(19.956624164931537, 22.403621394715667)

In [17]:
# Support vector regression with default settings, skipping the standardization step.
pipe = make_pipeline(col_trans, SVR()).fit(x_train, y_train)
# R^2 expressed as a percentage, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(50.27805611627228, 57.727423214049644)

## Applying Decision Tree

In [18]:
# Unconstrained decision tree on encoded + standardized features.
# random_state is fixed because the tree permutes features at every split; without a
# seed, equally good splits are resolved differently from run to run and the test
# score changes on each execution.
pipe = make_pipeline(col_trans,scaler,DecisionTreeRegressor(random_state=0))
pipe.fit(x_train,y_train)
# R^2 expressed as a percentage, reported as (train, test).
pipe.score(x_train,y_train)*100,pipe.score(x_test,y_test)*100

(99.2421488448164, 75.63141625499516)

In [19]:
# Unconstrained decision tree without the standardization step.
# random_state is fixed because the tree permutes features at every split; without a
# seed, equally good splits are resolved differently from run to run and the test
# score changes on each execution.
pipe = make_pipeline(col_trans,DecisionTreeRegressor(random_state=0))
pipe.fit(x_train,y_train)
# R^2 expressed as a percentage, reported as (train, test).
pipe.score(x_train,y_train)*100,pipe.score(x_test,y_test)*100

(99.2421488448164, 73.56690019610204)

In [20]:
# Parameter grid for the decision-tree search: four split criteria crossed with
# maximum depths 2 through 19.
pg = dict(
    criterion=["squared_error", "friedman_mse", "absolute_error", "poisson"],
    max_depth=list(range(2, 20)),
)

In [21]:
# Cross-validated search over the tree's split criterion and depth, run on the
# encoded + standardized features.
# random_state pins the tree's random feature permutation so the selected parameters
# and the resulting scores are the same on every run.
# NOTE(review): the encoder and scaler are fitted once on all of x_train before the
# search forms its folds, so the validation folds are not fully unseen by the
# preprocessing — confirm this is acceptable for the comparison.
pipe = make_pipeline(col_trans,scaler,GridSearchCV(DecisionTreeRegressor(random_state=0),param_grid=pg))
pipe.fit(x_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [22]:
# R^2 (as a percentage) of the currently fitted pipeline, reported as (train, test).
tuple(
    100 * pipe.score(features, target)
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(94.70156709868573, 76.55221780908151)