In [12]:
import pandas as pd

df = pd.read_csv('../data/preprocessed/preprocessed_data.csv')
X = df.drop(['price'],axis='columns')
y = df.price


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [14]:
from sklearn.linear_model import LinearRegression

# train the model using an Linear Regression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
# score method will tell you the score
lr_clf.score(X_test,y_test)

0.8629132245229444

In [15]:
from sklearn.linear_model import LinearRegression

# train the model using an Linear Regression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
# score method will tell you the score
lr_clf.score(X_test,y_test)

0.8629132245229444

<h2 style='color:white'>Use K Fold cross validation to measure accuracy of our LinearRegression model</h2>

In [16]:
# K fold cross validation
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
import numpy as np

# shuffle split and each fold has random rows or records
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

scores_lr = cross_val_score(LinearRegression(), X, y, cv=cv)
print(scores_lr) #In the result of it, we can see the gap between maximum and minimum is not big. Therefore, we don't need to do hyperparameter tuning.
np.average(scores_lr)

[0.82702546 0.86027005 0.85322178 0.8436466  0.85481502]


0.8477957812447722

In [17]:
# Note: K fold cv gives accuracy (only?)

<h2 style='color:white'>Find best model using GridSearchCV</h2>
<p> Using the gridsearch it will tell you which best model and it will do the hyper parameter tunning as well </p>

In [18]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd

def find_best_model_using_gridsearchcv(X, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'ridge': {
            'model': Ridge(),
            'params': {
                'alpha': [0.1, 1, 10],
                'fit_intercept': [True, False]
            }
        },
        'elasticnet': {
            'model': ElasticNet(),
            'params': {
                'alpha': [0.1, 1, 10],
                'l1_ratio': [0.1, 0.5, 0.9]
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        # 'random_forest': {
        #     'model': RandomForestRegressor(),
        #     'params': {
        #         'n_estimators': [10, 50, 100],
        #         'max_depth': [None, 10, 20],
        #         'min_samples_split': [2, 5, 10]
        #     }
        # },
        # 'gradient_boosting': {
        #     'model': GradientBoostingRegressor(),
        #     'params': {
        #         'n_estimators': [50, 100, 200],
        #         'learning_rate': [0.01, 0.1, 0.2],
        #         'max_depth': [3, 5, 7]
        #     }
        # },
        # 'svr': {
        #     'model': SVR(),
        #     'params': {
        #         'kernel': ['linear', 'rbf'],
        #         'C': [0.1, 1, 10],
        #         'gamma': ['scale', 'auto']
        #     }
        # },
        # 'knn': {
        #     'model': KNeighborsRegressor(),
        #     'params': {
        #         'n_neighbors': [3, 5, 7],
        #         'weights': ['uniform', 'distance']
        #     }
        # }
    }

    scores = []
    
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    # this is a dictonarie and which is inside that previous function
    for algo_name, config in algos.items():
        #  this cv para is Cross Validation
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        # it will tell you the best params in particular run
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    # pd.DataFrame will print it as tabular format
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.847951,{'fit_intercept': False}
1,lasso,0.726809,"{'alpha': 2, 'selection': 'random'}"
2,ridge,0.84805,"{'alpha': 0.1, 'fit_intercept': False}"
3,elasticnet,0.756214,"{'alpha': 0.1, 'l1_ratio': 0.9}"
4,decision_tree,0.71569,"{'criterion': 'squared_error', 'splitter': 'be..."


In [25]:
from sklearn.model_selection import cross_val_score
import numpy as np

from sklearn.linear_model import LinearRegression

# train the model using an Linear Regression
lr_clf = LinearRegression(fit_intercept=False)

lr_clf.fit(X_train,y_train)
# score method will tell you the score
lr_clf.score(X_test,y_test)

y_pred = lr_clf.predict(X_test)


from sklearn.metrics import mean_squared_error 
print("mean_squared_error",mean_squared_error(y_test, y_pred))

from sklearn.metrics import mean_squared_error
print("root_mean_squared_error", mean_squared_error(y_test, y_pred, squared=False))

from sklearn.metrics import mean_absolute_error
print("mean_absolute_error",mean_absolute_error(y_test, y_pred))

from sklearn.metrics import r2_score
print("R-squared",r2_score(y_test, y_pred))


Adj_r2 = 1 - (1-r2_score(y_test, y_pred)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("Adjusted r2 score",Adj_r2)

from sklearn.metrics import mean_absolute_percentage_error
print("mean_absolute_percentage_error",mean_absolute_percentage_error(y_test, y_pred))

mean_squared_error 710.8059794200266
root_mean_squared_error 26.660944833595575
mean_absolute_error 16.156268053670473
R-squared 0.8629615011589108
Adjusted r2 score 0.8353033988180597
mean_absolute_percentage_error 0.1987442425801653




In [26]:
from sklearn.linear_model import Ridge

# train the model using an Linear Regression
ridge_model = Ridge(alpha=0.1, fit_intercept=False) #{'alpha': 0.1, 'fit_intercept': False}

ridge_model.fit(X_train,y_train)
# score method will tell you the score
ridge_model.score(X_test,y_test)

y_pred = ridge_model.predict(X_test)

# for i in zip(y_test, y_pred):
    # print(i)
    # break

from sklearn.metrics import mean_squared_error 
print("mean_squared_error",mean_squared_error(y_test, y_pred))

from sklearn.metrics import mean_squared_error
print("root_mean_squared_error", mean_squared_error(y_test, y_pred, squared=False))

from sklearn.metrics import mean_absolute_error
print("mean_absolute_error",mean_absolute_error(y_test, y_pred))

from sklearn.metrics import r2_score
print("R-squared",r2_score(y_test, y_pred))


Adj_r2 = 1 - (1-r2_score(y_test, y_pred)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("Adjusted r2 score",Adj_r2)

from sklearn.metrics import mean_absolute_percentage_error
print("mean_absolute_percentage_error",mean_absolute_percentage_error(y_test, y_pred))

mean_squared_error 711.4319490782134
root_mean_squared_error 26.67268170016306
mean_absolute_error 16.118159396313608
R-squared 0.8628408185186944
Adjusted r2 score 0.8351583591333479
mean_absolute_percentage_error 0.1978118759491989




In [28]:
from sklearn.linear_model import Lasso

# lasso_model = Lasso()
lasso_model = Lasso(alpha=2, selection='random')
lasso_model.fit(X_train,y_train)
y_pred = lasso_model.predict(X_test)

from sklearn.metrics import mean_squared_error 
print("mean_squared_error",mean_squared_error(y_test, y_pred))

from sklearn.metrics import mean_squared_error
print("root_mean_squared_error", mean_squared_error(y_test, y_pred, squared=False))

from sklearn.metrics import mean_absolute_error
print("mean_absolute_error",mean_absolute_error(y_test, y_pred))

from sklearn.metrics import r2_score
print("R-squared",r2_score(y_test, y_pred))


Adj_r2 = 1 - (1-r2_score(y_test, y_pred)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("Adjusted r2 score",Adj_r2)

from sklearn.metrics import mean_absolute_percentage_error
print("mean_absolute_percentage_error",mean_absolute_percentage_error(y_test, y_pred))

mean_squared_error 1488.5784899841306
root_mean_squared_error 38.582100642449866
mean_absolute_error 23.254978303216955
R-squared 0.7130123161864708
Adjusted r2 score 0.6550903833237733
mean_absolute_percentage_error 0.27505323769901




In [None]:
# Considering our target variable is not in 6 digits but in 2 digits(ie, 43 instead of 4,300,000)
# good or bad based is denoted based on higher or lower(h, l)

#          LR                                                   Lasso                                                 Lasso with best parameters
         
#          mean_squared_error 710.8059794200266                 mean_squared_error 1460.2194454865455                 mean_squared_error 1488.5784899841306                       
# h        root_mean_squared_error 26.660944833595575           root_mean_squared_error 38.21281781662464             root_mean_squared_error 38.582100642449866                                 
# l        mean_absolute_error 16.156268053670473               mean_absolute_error 23.157751000677226                mean_absolute_error 23.254978303216955                          
# h        R-squared 0.8629615011589108                         R-squared 0.7184797447099163                          R-squared 0.7130123161864708      
# h        Adjusted r2 score 0.8353033988180597                 Adjusted r2 score 0.661661287869808                   Adjusted r2 score 0.6550903833237733                     
# l        mean_absolute_percentage_error 0.1987442425801653    mean_absolute_percentage_error 0.27512403601036683    mean_absolute_percentage_error 0.27505323769901                                                 




#         Ridge
        
#         mean_squared_error 711.4319490782134
# h       root_mean_squared_error 26.67268170016306
# l       mean_absolute_error 16.118159396313608
# h       R-squared 0.8628408185186944
# h       Adjusted r2 score 0.8351583591333479
# l       mean_absolute_percentage_error 0.1978118759491989

In [29]:
#Since in aboves, the cross validation was not done when calculating the metrics such as mse and etc, we will now use cross validation to find y_pred and hences calculate the needed metrics++
#Using kfold cross validation
from sklearn.model_selection import cross_validate

print("Linear regression")
scoring = {'mse_my_value': 'neg_mean_squared_error', 'mae': 'neg_mean_absolute_error', 'r2': 'r2'}
scores = cross_validate(lr_clf, X, y, cv=5, scoring=scoring)
# print(scores)
print("MSE: ", -scores['test_mse_my_value'])
print("MAE: ", -scores['test_mae'])
print("R²: ", scores['test_r2'])
print()
print("Ridge")
scoring = {'mse': 'neg_mean_squared_error', 'mae': 'neg_mean_absolute_error', 'r2': 'r2'}
scores = cross_validate(ridge_model, X, y, cv=5, scoring=scoring)
# print(scores)
print("MSE: ", -scores['test_mse'])
print("MAE: ", -scores['test_mae'])
print("R²: ", scores['test_r2'])
print()
print("Lasso")
scoring = {'mse': 'neg_mean_squared_error', 'mae': 'neg_mean_absolute_error', 'r2': 'r2'}
scores = cross_validate(lasso_model, X, y, cv=5, scoring=scoring)
# print(scores)
print("MSE: ", -scores['test_mse'])
print("MAE: ", -scores['test_mae'])
print("R²: ", scores['test_r2'])

Linear regression
MSE:  [1.04037871e+18 6.14280903e+18 3.10254371e+03 2.59734803e+03
 2.68660785e+03]
MAE:  [7.17737800e+08 3.95137557e+08 3.82973991e+01 4.06393232e+01
 3.17669469e+01]
R²:  [-1.56709478e+14 -1.08365632e+15  5.83719779e-01  4.92052806e-01
  7.81392317e-01]

Ridge
MSE:  [3813.20579033 2491.47960968 3092.60610402 2586.2085067  2699.80012673]
MAE:  [43.50104463 39.37307955 38.16269217 40.48289215 31.69074324]
R²:  [0.42562695 0.56047671 0.58505315 0.49423129 0.78031887]

Lasso
MSE:  [2860.41668    1671.03093394 2140.52275421 1826.67115319 3317.28457973]
MAE:  [25.9659583  22.52777026 21.51430367 24.08477084 32.83928104]
R²:  [0.56914304 0.70521251 0.71279783 0.64276929 0.73007453]


In grid search cv, it does cross validation, but it considers only accuracy metric. But in here, the best hyperparameters are searched.
In cross_validates, it ofc does cross validation, but considers other metrics too. But in here, the best hyperparameters are not searched.
	So is there a method that can do that too?


In [7]:

"""
 predict the price of a house based on its location, square footage (sqft),
 number of bathrooms (bath), and number of bedrooms (bhk)
"""

"""
Find the Index of the Location:

loc_index = np.where(X.columns=='location2')[0][0] finds the index corresponding to
'location2' in X.columns, say it's 4.
Initialize the Feature Array:

x = np.zeros(len(X.columns)) creates an array of zeros, e.g., [0, 0, 0, 0, 0, 0, ...].
Set the Sqft, Bath, and BHK Values:

x[0] = 1200, x[1] = 2, x[2] = 3 sets the first three values in x, so x becomes [1200, 2, 3, 0, 0, 0, ...].
Set the One-Hot Encoded Location:

if loc_index >= 0: x[4] = 1 sets the value of the corresponding location feature
(for 'location2') to 1, so x becomes [1200, 2, 3, 0, 1, 0, ...].
Make the Prediction:

The trained model lr_clf.predict([x]) is used to predict the price based on these features, and
the predicted price is returned.
"""


import numpy as np
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    
    if loc_index >= 0:
        # we are setting 4th index value as 1(True) means it is existing in the table if the user entered
        # new place it will become zero.
        x[loc_index] = 1

    print(x)
    # The model expects input as a 2D array (i.e., a list of features), which is why [x] is used.
    # [0] will be return prediction
    return lr_clf.predict([x])[0]