In [1]:
import numpy
import sklearn
import matplotlib
import pandas as pd
%matplotlib inline
matplotlib.rcParams["figure.figsize"]=(20,10)

In [2]:
df=pd.read_csv("Bengaluru_House_Data.csv")

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [5]:
df.shape

(13320, 9)

In [6]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [7]:
df['bath'].head()

0    2.0
1    5.0
2    2.0
3    3.0
4    2.0
Name: bath, dtype: float64

In [8]:
type(df)

pandas.core.frame.DataFrame

In [9]:
type(df['bath'])

pandas.core.series.Series

In [10]:
#dropping some unimportant columns
df=df.drop(['area_type','society','availability','balcony'],axis='columns')

In [11]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [12]:
#data cleaning
#checking if any value is null or not
df.isnull().sum()


location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [13]:
#dropping that row which is null
df=df.dropna()#this function is use to drop those rows which is null

In [14]:
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [15]:
#Checking all unique value of size column
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [16]:
df['bhk']=df['size'].apply(lambda x: int(x.split(' ')[0]))#lambda function is a anonymous function(nameless function) which has only expression

In [17]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [18]:
df['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [19]:
#printing some fixed values rows.It is like querry in mysql
df[df.bhk>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [20]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [21]:
#we can observe that there is a range of values.
#so we have to overcome this problem
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [22]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [23]:
df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [24]:
#converting range value into a single number
def sqft_to_num(x):
    a=x.split('-')
    if len(a)==2:
        return (float(a[0])+float(a[1]))/2
    try:
        return float(x)
    except:
        return None

In [25]:
df['total_sqft']=df['total_sqft'].apply(sqft_to_num)

In [26]:
df[df.price==265.000]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
648,Arekere,9 Bedroom,,9.0,265.0,9
1097,Jalahalli,5 BHK,3100.0,4.0,265.0,5
2667,Basavangudi,3 BHK,2150.0,3.0,265.0,3
3785,Nagarbhavi,3 Bedroom,1200.0,3.0,265.0,3
4338,Frazer Town,3 BHK,2560.0,3.0,265.0,3
9053,Kothanur,4 Bedroom,3400.0,5.0,265.0,4
9570,Chikkalasandra,5 Bedroom,1500.0,5.0,265.0,5
9748,Hosakerehalli,3 BHK,2480.0,4.0,265.0,3
10002,Whitefield,4 Bedroom,3940.0,5.0,265.0,4
10871,Lakshmiamma Garden,3 BHK,3000.0,3.0,265.0,3


In [27]:
df=df.dropna()

In [28]:
df[df.price==265.000]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1097,Jalahalli,5 BHK,3100.0,4.0,265.0,5
2667,Basavangudi,3 BHK,2150.0,3.0,265.0,3
3785,Nagarbhavi,3 Bedroom,1200.0,3.0,265.0,3
4338,Frazer Town,3 BHK,2560.0,3.0,265.0,3
9053,Kothanur,4 Bedroom,3400.0,5.0,265.0,4
9570,Chikkalasandra,5 Bedroom,1500.0,5.0,265.0,5
9748,Hosakerehalli,3 BHK,2480.0,4.0,265.0,3
10002,Whitefield,4 Bedroom,3940.0,5.0,265.0,4
10871,Lakshmiamma Garden,3 BHK,3000.0,3.0,265.0,3
11328,Infantry Road,4 BHK,2170.0,3.0,265.0,4


In [29]:
df.loc[1097][2]# to get a unique row by row number

3100.0

In [30]:
df.loc[30][2]

2475.0

In [31]:
5+2/2

6.0

In [32]:
#feature engineering---This is a method of transforming raw data into a more valuable feature
df['price_per_sqft']=(df['price']*100000)/df['total_sqft']
df.head(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [33]:
len(df['location'].unique())

1298

In [34]:
#As we know that machine learning can only work on numeric data.So now we will work on location column
df['location']=df['location'].apply(lambda x: x.strip())#strip() function will remove the whitespaces from begining and end froma a string.
location_stats= df.groupby('location')['location'].agg('count')
location_stats


location
1 Annasandrapalya                                  1
1 Giri Nagar                                       1
1 Immadihalli                                      1
1 Ramamurthy Nagar                                 1
12th cross srinivas nagar banshankari 3rd stage    1
                                                  ..
t.c palya                                          1
tc.palya                                           4
vinayakanagar                                      1
white field,kadugodi                               1
whitefiled                                         1
Name: location, Length: 1287, dtype: int64

In [35]:
len(location_stats)

1287

In [36]:
type(location_stats)

pandas.core.series.Series

In [37]:
max(location_stats)

533

In [38]:
location_stats.sort_values(ascending=False)

location
Whitefield              533
Sarjapur  Road          392
Electronic City         304
Kanakpura Road          264
Thanisandra             235
                       ... 
Kumbhena Agrahara         1
Kudlu Village,            1
Konappana Agrahara        1
Kodanda Reddy Layout      1
1 Annasandrapalya         1
Name: location, Length: 1287, dtype: int64

In [39]:
len(location_stats[location_stats<=10])

1047

In [40]:
location_stats_less_than_10=location_stats[location_stats<=10]
location_stats_less_than_10

location
1 Annasandrapalya                                  1
1 Giri Nagar                                       1
1 Immadihalli                                      1
1 Ramamurthy Nagar                                 1
12th cross srinivas nagar banshankari 3rd stage    1
                                                  ..
t.c palya                                          1
tc.palya                                           4
vinayakanagar                                      1
white field,kadugodi                               1
whitefiled                                         1
Name: location, Length: 1047, dtype: int64

In [41]:
df['location']=df['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x )

In [42]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,other,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


In [43]:
len(df.location.unique())

241

In [44]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [45]:
#our machine learning algorithm cannot work on string columns so we have to convert it into numeric column
#for this we have to do hot encoding.We use get_dummies method in pandas for this task.It will convert catagorical data into numeric data
dummies=pd.get_dummies(df.location)

In [46]:
dummies.head()

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
dummies.drop('other',axis='columns')

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13316,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13317,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
df=pd.concat([df,dummies],axis='columns')

In [49]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
df=df.drop(['price_per_sqft','size','location'],axis='columns')

In [51]:
#Now we have to create independent variables for our training datasets.For this we will remove the price column from our dataframe.
x=df.drop('price',axis='columns')

In [52]:
x.head()

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1056.0,2.0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600.0,5.0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521.0,3.0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
#now thsi is dependent variable
y=df.price

In [54]:
y.head()

0     39.07
1    120.00
2     62.00
3     95.00
4     51.00
Name: price, dtype: float64

In [55]:
#now we have to split our dataset for training and testing
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [56]:
X_train.shape

(10560, 244)

In [57]:
x.shape

(13200, 244)

In [58]:
X_test.shape

(2640, 244)

In [59]:
Y_train.shape

(10560,)

In [60]:
Y_test.shape

(2640,)

In [61]:
#Here we are using linear regression model to train our data
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,Y_train)

LinearRegression()

In [62]:
lr.score(X_test,Y_test)

0.5120722457222469

In [63]:
#this is a regularized linear regression model which prevent data overfitting
from sklearn.linear_model import Ridge
ridge_reg=Ridge(alpha=0.1,solver='cholesky')
ridge_reg.fit(X_train,Y_train)

Ridge(alpha=0.1, solver='cholesky')

In [64]:
ridge_reg.score(X_test,Y_test)

0.5121182626875866

In [65]:
#normal linear regression model and regularized linear model both are not giving good accuracy.Lets try another regularized linear model.
#Now lets apply Lasso regression.This is same as ridge but it neglates unuseful features.
from sklearn.linear_model import Lasso
lasso_reg=Lasso(alpha=0.1)
lasso_reg.fit(X_train,Y_train)

Lasso(alpha=0.1)

In [66]:
lasso_reg.score(X_test,Y_test)

0.4965543924243595

In [67]:
#Lets try another regularized linear regression
#Elastic Net---It is mixture of ridge and lasso
from sklearn.linear_model import ElasticNet
elasticnet_reg=ElasticNet(alpha=0.1,l1_ratio=0.5)
elasticnet_reg.fit(X_train,Y_train)

ElasticNet(alpha=0.1)

In [68]:
elasticnet_reg.score(X_test,Y_test)

0.4650265843265926

In [69]:
#using k fold cross validation
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv= ShuffleSplit(n_splits=5 ,test_size=0.2,random_state=0)
cross_val_score(LinearRegression(),x,y,cv=cv)

array([0.49535067, 0.49928552, 0.38405532, 0.46847373, 0.34018018])

In [70]:
cross_val_score(elasticnet_reg,x,y,cv=cv)

array([0.4395168 , 0.47035671, 0.34392486, 0.40498603, 0.27723149])

In [71]:
cross_val_score(lasso_reg,x,y,cv=cv)

array([0.48403208, 0.49480134, 0.37559777, 0.45201   , 0.32951857])

In [72]:
cross_val_score(ridge_reg,x,y,cv=cv)

array([0.49530584, 0.499584  , 0.38408235, 0.46819299, 0.3401604 ])

In [73]:
#now we are using decision tree
from sklearn.tree import DecisionTreeRegressor
decisiontree=DecisionTreeRegressor()
decisiontree.fit(X_train,Y_train)

DecisionTreeRegressor()

In [74]:
decisiontree.score(X_test,Y_test)

0.5124712550456652

In [75]:
cross_val_score(decisiontree,x,y,cv=cv)

array([ 0.5090313 ,  0.480198  ,  0.36208378, -0.00731503, -0.1117766 ])

In [76]:
x.columns

Index(['total_sqft', 'bath', 'bhk', '1st Block Jayanagar',
       '1st Phase JP Nagar', '2nd Phase Judicial Layout',
       '2nd Stage Nagarbhavi', '5th Block Hbr Layout', '5th Phase JP Nagar',
       '6th Phase JP Nagar',
       ...
       'Vishveshwarya Layout', 'Vishwapriya Layout', 'Vittasandra',
       'Whitefield', 'Yelachenahalli', 'Yelahanka', 'Yelahanka New Town',
       'Yelenahalli', 'Yeshwanthpur', 'other'],
      dtype='object', length=244)

In [77]:
import numpy as np
np.where(x.columns=='total_sqft')[0][0]#it will give you the index value of the the column name given

0

In [78]:
#Now we have to provide inputs which is location,bhk,bath,sqft
def pred_price(location,sqft,bath,bhk):
    location_index=np.where(x.columns==location)[0][0]
    a=np.zeros(len(x.columns))
    a[0]=sqft
    a[1]=bath
    a[2]=bhk
    if location_index>=0:
        a[location_index]=1
    return lr.predict([a])[0] #the output is an 1d array having only 1 element so we need to print only 1st element of array.

In [79]:
x.loc[0]

total_sqft             1056.0
bath                      2.0
bhk                       2.0
1st Block Jayanagar       0.0
1st Phase JP Nagar        0.0
                        ...  
Yelahanka                 0.0
Yelahanka New Town        0.0
Yelenahalli               0.0
Yeshwanthpur              0.0
other                     0.0
Name: 0, Length: 244, dtype: float64

In [80]:
pred_price('1st Block Jayanagar',1056,2,2)

157.23701149897067

In [81]:
df.loc[0]

total_sqft             1056.00
bath                      2.00
price                    39.07
bhk                       2.00
1st Block Jayanagar       0.00
                        ...   
Yelahanka                 0.00
Yelahanka New Town        0.00
Yelenahalli               0.00
Yeshwanthpur              0.00
other                     0.00
Name: 0, Length: 245, dtype: float64

In [82]:
#exporting our model
import pickle
with open('home_price.pickle','wb') as f:
    pickle.dump(lr,f)

In [83]:
#Now to transport our data from web application to sever we need json file of our columns
#json-javascript object notation
import json
columns={
    'data_columns':[col.lower() for col in x.columns]
}#It is a dictionary
with open("columns.json","w") as f:
    f.write(json.dumps(columns))

In [86]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X_train,Y_train):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X_train,Y_train)

Unnamed: 0,model,best_score,best_params
0,linear_regression,-0.982826,{'normalize': False}
1,lasso,0.0,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.2,"{'criterion': 'mse', 'splitter': 'best'}"


In [1]:
from sklearn.naive_bayes import MultinomialNB

In [2]:
model=MultinomialNB()