In [1]:
#Importing Important Libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,10)

In [2]:
#Loading our Dataset
df = pd.read_csv('processed_data.csv')
df.head()

Unnamed: 0,area_type,total_sqft,bath,price,bhk,Electronic City,Hebbal,Kanakpura Road,Marathahalli,Raja Rajeshwari Nagar,Sarjapur Road,Thanisandra,Uttarahalli,Whitefield,Yelahanka,other
0,2.0,1025.0,2,47.0,3,1,0,0,0,0,0,0,0,0,0,0
1,2.0,1500.0,2,64.5,3,1,0,0,0,0,0,0,0,0,0,0
2,0.0,1060.0,2,60.0,2,1,0,0,0,0,0,0,0,0,0,0
3,2.0,1152.0,2,64.5,2,1,0,0,0,0,0,0,0,0,0,0
4,0.0,770.0,1,36.0,2,1,0,0,0,0,0,0,0,0,0,0


In [3]:
X = df.drop('price' , axis = 'columns')
X.head()

Unnamed: 0,area_type,total_sqft,bath,bhk,Electronic City,Hebbal,Kanakpura Road,Marathahalli,Raja Rajeshwari Nagar,Sarjapur Road,Thanisandra,Uttarahalli,Whitefield,Yelahanka,other
0,2.0,1025.0,2,3,1,0,0,0,0,0,0,0,0,0,0
1,2.0,1500.0,2,3,1,0,0,0,0,0,0,0,0,0,0
2,0.0,1060.0,2,2,1,0,0,0,0,0,0,0,0,0,0
3,2.0,1152.0,2,2,1,0,0,0,0,0,0,0,0,0,0
4,0.0,770.0,1,2,1,0,0,0,0,0,0,0,0,0,0


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6008 entries, 0 to 6007
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   area_type              6008 non-null   float64
 1   total_sqft             6008 non-null   float64
 2   bath                   6008 non-null   int64  
 3   bhk                    6008 non-null   int64  
 4   Electronic City        6008 non-null   int64  
 5   Hebbal                 6008 non-null   int64  
 6   Kanakpura Road         6008 non-null   int64  
 7   Marathahalli           6008 non-null   int64  
 8   Raja Rajeshwari Nagar  6008 non-null   int64  
 9   Sarjapur  Road         6008 non-null   int64  
 10  Thanisandra            6008 non-null   int64  
 11  Uttarahalli            6008 non-null   int64  
 12  Whitefield             6008 non-null   int64  
 13  Yelahanka              6008 non-null   int64  
 14  other                  6008 non-null   int64  
dtypes: f

In [5]:
#X["total_sqft"]=tot_sqft
#X_scl.head()

We are dropping the price column because it is a dependent variable. We will train our model using independent variables.

In [6]:
#Making a new DataFrame for price
y = df.price
y.head()

0    47.0
1    64.5
2    60.0
3    64.5
4    36.0
Name: price, dtype: float64

In [7]:
# df without location (giving low accuracy )
X_4=X.iloc[:,:4]
X_4.drop('bath',axis='columns')
#train_test_split
from sklearn.model_selection import train_test_split
X4_train , X4_test , y4_train , y4_test = train_test_split(X_4 , y , test_size = 0.2 , random_state = 10)
X_4.head()

Unnamed: 0,area_type,total_sqft,bath,bhk
0,2.0,1025.0,2,3
1,2.0,1500.0,2,3
2,0.0,1060.0,2,2
3,2.0,1152.0,2,2
4,0.0,770.0,1,2


In [8]:
#train_test_split
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 0)

In [9]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
X_train_scl=scaler.fit_transform(X_train)
X_test_scl=scaler.transform(X_test)

We will train our model using 80% of the DataFrame and will test our model using 20% of the DataFrame.

### Linear Regression

Let's try Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train , y_train)
lr_clf.score(X_test,y_test)

0.8825924320638386

In [11]:
import pickle
pickle.dump(lr_clf,open('lr_clf.pkl','wb'))

NameError: name 'pickle' is not defined

Our LinearRegression model is giving an accuracy of 86.9% let's see if it can be improved by using K Fold Cross Validation.

### K Fold Cross Validation

In [None]:
#K - fold cross validation
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv = ShuffleSplit(n_splits = 5 , test_size = 0.2 , random_state = 0)
cross_val_score(LinearRegression() , X , y , cv = cv)

array([0.86619299, 0.89551432, 0.87627839, 0.88071143, 0.87972576])

We can see that all our folds - that means our LinearRegression model with data shuffled each time is giving results above 86%. Our score of 86% is not bad enough. 

Now we will try some other Regressors also and see whether they perform better than our LinearRegressor, we will use GridSearchCV for this.

### Finding best model using GridSearchCv

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from xgboost import XGBRegressor
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

#Function to choose the best model
def find_best_model(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'Rand_forest':{
            'model':RandomForestRegressor(),
            'params':{
                'n_estimators': [50, 100,150],
                'max_depth': [ 20, 30,40],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2,3],
            }
        },
        'ridge_regression': {
            'model': Ridge(),
            'params': {
                'alpha': [0.1, 1, 10, 100],
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [0.1, 1],
                'normalize': [True, False],
                'selection': ['random', 'cyclic'],
                
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
         }
           ,
        'xgboost': {
            'model': XGBRegressor(),
            'params': {
                'learning_rate': [0.05, 0.10, 0.15],
                'max_depth': [3, 5, 7],
                'n_estimators': [50, 100, 150]
            }
        }
           
    }

    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
        print(gs.best_params_)
    #return pd.DataFrame(scores , columns = ['model' , 'best_score' , 'best_params'])

#Calling find_best_model
#find_best_model(X,y)
        print('\nR2 score on test data = ',r2_score(y_test, gs.predict(X_test))) # This will print the test score in output 

    return pd.DataFrame(scores , columns = ['model' , 'best_score' , 'best_params'])

#Calling find_best_model
find_best_model(X,y) # Pass X_train and y_train instead of X and y ; try X_train_scaled and X_test_scaled too.


{'normalize': False}

R2 score on test data =  0.8661929863672646
{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}

R2 score on test data =  0.8579997305107671
{'alpha': 10, 'normalize': False}

R2 score on test data =  0.8661961266104972
{'alpha': 0.1, 'normalize': False, 'selection': 'random'}

R2 score on test data =  0.8659754685048819
{'criterion': 'friedman_mse', 'splitter': 'random'}

R2 score on test data =  0.7881201166266971
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 150}

R2 score on test data =  0.8713347628263122


Unnamed: 0,model,best_score,best_params
0,linear_regression,0.885098,{'normalize': False}
1,Rand_forest,0.886834,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_..."
2,ridge_regression,0.885108,"{'alpha': 10, 'normalize': False}"
3,lasso,0.884756,"{'alpha': 0.1, 'normalize': False, 'selection'..."
4,decision_tree,0.829685,"{'criterion': 'friedman_mse', 'splitter': 'ran..."
5,xgboost,0.889709,"{'learning_rate': 0.05, 'max_depth': 3, 'n_est..."


In [None]:
# Stacking regressor
from sklearn.ensemble import StackingRegressor
estimators=[
    ('ridge',Ridge(alpha= 0.1, normalize= False)),
    #('lasso',Lasso(alpha= 0.1)),#, normalize= False)),
    #('DTree',DecisionTreeRegressor()),
    ('lr',LinearRegression()),
    ('randf',RandomForestRegressor(max_depth= 40, min_samples_leaf= 1, min_samples_split= 10, n_estimators= 100))
]

In [None]:
reg=StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge()
)

reg.fit(X_train,y_train)
reg.score(X_test,y_test)

0.8772308716068007

In [None]:
X4_train.head()

Unnamed: 0,area_type,total_sqft,bath,bhk
5086,2.0,1770.0,3,3
1723,2.0,1246.0,2,2
321,2.0,1300.0,3,3
4979,2.0,1230.0,2,2
3491,2.0,929.0,2,2


In [None]:
#voting
from sklearn.ensemble import VotingRegressor

voting_regressor = VotingRegressor(estimators=[
    ('ridge',Ridge(alpha= 0.1)),
    ('lasso',Lasso(alpha= 0.1)),
    ('DTree',DecisionTreeRegressor()),
    #('lr',LinearRegression())
])

cross_val_score(voting_regressor,X,y).mean()

0.8716799489524909

In [None]:
params={'weights':[(1,1,1),(1,1,2),(1,2,1),(2,1,1)]}

grid=GridSearchCV(voting_regressor,params)
grid.fit(X,y)
grid.best_params_

{'weights': (2, 1, 1)}

In [None]:
grid.best_score_

0.8731405561411894

In [None]:

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor
import pandas as pd

#Function to choose the best model
def find_best_model(X,y):
    algos = {
        
        'xgboost': {
            'model': XGBRegressor(),
            'params': {
                'learning_rate': [0.01,0.1,0.05],
                'max_depth': [3,1,6],
                'n_estimators': [450, 500,600],
                #'gamma':[0.0,0.1,0.2],
                'colsample_bytree':[0.7,0.8],
                'min_child_weight':[1,5,10,15,17]
            }
        }
       ,
        'Rand_forest':{
            'model':RandomForestRegressor(),
            'params':{
                'n_estimators': [50,100,150],
                'max_depth': [ 20,30,40],
                'min_samples_split': [2,5,10],
                'min_samples_leaf': [3,1,2]
            }
        },
    
    }

    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
        print(gs.best_params_)
    #return pd.DataFrame(scores , columns = ['model' , 'best_score' , 'best_params'])

#Calling find_best_model
#find_best_model(X,y)
        print('\nR2 score on test data = ',r2_score(y_test, gs.predict(X_test))) # This will print the test score in output 

    return pd.DataFrame(scores , columns = ['model' , 'best_score' , 'best_params'])

#Calling find_best_model
find_best_model(X_train,y_train) # Pass X_train and y_train instead of X and y ; try X_train_scaled and X_test_scaled too.

KeyboardInterrupt: 

In [None]:
import pickle
pickle.dump(reg,open('reg.pkl','wb'))

NameError: name 'reg' is not defined

In [None]:
xgb=XGBRegressor(learning_rate= 0.1, max_depth= 3, n_estimators= 100);
xgb.fit(X,y)
xgb.score(X_test,y_test)

0.898467072638533

In [None]:
pickle.dump(xgb,open('xgb.pkl','wb'))

In [None]:
pickle.dump(lr_clf,open('lr_clf.pkl','wb'))

In [None]:
import sklearn

print("scikit-learn version:", sklearn.__version__)


scikit-learn version: 1.2.2
