In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('model.csv')

In [44]:
df.rename(columns={'Unnamed: 0':'id'},inplace=True)
df.columns

Index(['id', 'area_type', 'availability', 'size', 'total_sqft', 'bath',
       'balcony', 'price', 'Electronic City', 'Hebbal', 'Kanakpura Road',
       'Marathahalli', 'Raja Rajeshwari Nagar', 'Sarjapur  Road',
       'Thanisandra', 'Uttarahalli', 'Whitefield', 'Yelahanka'],
      dtype='object')

In [4]:
#Standardization

In [47]:
y = df['price']
X = df.drop(['price','id'],axis=1)

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 

In [9]:
# hyper paramer tuning

In [32]:
from sklearn.linear_model import LinearRegression

In [33]:
lr = LinearRegression()

In [34]:
from pprint import pprint
print('Parameters currently in use:\n')
pprint(lr.get_params())

Parameters currently in use:

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}


In [35]:
from sklearn.model_selection import RandomizedSearchCV

copy_X = [True, False]
fit_intercept = [True, False]
normalize = [True, False]


In [36]:
# Create the random grid
random_grid = {'copy_X': copy_X,
               'fit_intercept':fit_intercept,
               'normalize': normalize}

In [37]:
pprint(random_grid)

{'copy_X': [True, False],
 'fit_intercept': [True, False],
 'normalize': [True, False]}


In [38]:
lr_random = RandomizedSearchCV(estimator = lr, param_distributions = random_grid, n_iter = 200, cv = 5, verbose=2, random_state=42, n_jobs = -1)

In [25]:
lr_random.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.7s finished


RandomizedSearchCV(cv=5, estimator=LinearRegression(), n_iter=200, n_jobs=-1,
                   param_distributions={'copy_X': [True, False],
                                        'fit_intercept': [True, False],
                                        'normalize': [True, False]},
                   random_state=42, verbose=2)

In [40]:
#print("lr_random.best_estimator_ " ,lr_random.best_estimator_)
print(lr_random.best_score_)
print(lr_random.best_params_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_score_'

In [48]:
# Using GridSearchCV to find the best algorithm for this problem
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [49]:
def find_best_model(X,y):
    models = {
        'linear_regression': {
            'model': LinearRegression(),
            'parameters': {
                'normalize': [True,False]
            }
        },
        
        'lasso': {
            'model': Lasso(),
            'parameters': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'parameters': {
                'criterion': ['mse', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }
    
    scores = []
    cv_X_y = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)
    
    for model_name, model_params in models.items():
        gs = GridSearchCV(model_params['model'], model_params['parameters'], cv=cv_X_y, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': model_name,
            'best_parameters': gs.best_params_,
            'accuracy': gs.best_score_
        })
        
    return pd.DataFrame(scores, columns=['model', 'best_parameters', 'accuracy'])

find_best_model(X, y)

Unnamed: 0,model,best_parameters,accuracy
0,linear_regression,{'normalize': True},0.383482
1,lasso,"{'alpha': 1, 'selection': 'random'}",0.381383
2,decision_tree,"{'criterion': 'friedman_mse', 'splitter': 'best'}",0.26644
