In [5]:
import pandas as pd

# Load the preprocessed dataset and separate the features from the target.
df = pd.read_csv('../data/preprocessed/preprocessed_data.csv')

# Every column except 'price' is a model input; 'price' is what we predict.
X = df.drop(columns=['price'])
y = df['price']


In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [7]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator itself).
lr_clf = LinearRegression().fit(X_train, y_train)

# For regressors, score() reports R^2 (coefficient of determination) on the given data.
lr_clf.score(X_test, y_test)

0.8629132245229436

In [8]:
from sklearn.linear_model import LinearRegression

# The model was already fitted in the previous cell; re-creating and re-fitting an
# identical LinearRegression here would only repeat that work, so just evaluate it.
# For regressors, score() reports R^2 (coefficient of determination) on the held-out split.
lr_clf.score(X_test, y_test)

0.8629132245229436

<h2 style='color:white'>Use shuffle-split cross-validation to measure the R² score of our LinearRegression model</h2>

In [9]:
# Cross validation with repeated random train/test splits (ShuffleSplit)
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five independent random splits, each holding out 20% of the rows for testing;
# the fixed seed makes the splits repeatable.
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=0,
)

# One score per split for a freshly constructed LinearRegression.
cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.82702546, 0.86027005, 0.85322178, 0.8436466 , 0.85481502])

<h2 style='color:white'>Find best model using GridSearchCV</h2>
<p> GridSearchCV tells us which model performs best and tunes each model's hyperparameters at the same time. </p>

In [10]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd

def find_best_model_using_gridsearchcv(X, y, n_jobs=-1, random_state=0):
    """Grid-search several regressors and report the best configuration of each.

    Every candidate model is tuned with GridSearchCV over its own parameter grid,
    using the same 5-split ShuffleSplit (20% test) for all of them so the scores
    are comparable.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    n_jobs : int or None, default -1
        Number of parallel jobs for each grid search. ``-1`` uses all available
        cores, which matters here because the grids are large and a sequential
        run is slow. Pass ``None`` for the sequential behaviour.
    random_state : int or None, default 0
        Seed given to the estimators that involve randomness (Lasso with
        ``selection='random'``, decision tree, random forest, gradient boosting)
        so that repeated runs give the same scores.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model``, ``best_score`` (the mean
        cross-validated score of the best parameter combination) and
        ``best_params``.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            # Seeded because selection='random' picks coefficients to update at random.
            'model': Lasso(random_state=random_state),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'ridge': {
            'model': Ridge(),
            'params': {
                'alpha': [0.1, 1, 10],
                'fit_intercept': [True, False]
            }
        },
        'elasticnet': {
            'model': ElasticNet(),
            'params': {
                'alpha': [0.1, 1, 10],
                'l1_ratio': [0.1, 0.5, 0.9]
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': [10, 50, 100],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        },
        'gradient_boosting': {
            'model': GradientBoostingRegressor(random_state=random_state),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        },
        'svr': {
            'model': SVR(),
            'params': {
                'kernel': ['linear', 'rbf'],
                'C': [0.1, 1, 10],
                'gamma': ['scale', 'auto']
            }
        },
        'knn': {
            'model': KNeighborsRegressor(),
            'params': {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance']
            }
        }
    }

    # The same seeded splitter is reused for every model so all of them are
    # evaluated on identical train/test partitions.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(
            config['model'],
            config['params'],
            cv=cv,
            n_jobs=n_jobs,
            return_train_score=False,
        )
        gs.fit(X, y)
        # Keep only the winning parameter combination of this model.
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    # A DataFrame renders as a table in the notebook.
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

# Run the grid search for every candidate model on the full feature matrix and target.
# NOTE: this cell is slow; the last recorded run was stopped with a KeyboardInterrupt.
find_best_model_using_gridsearchcv(X,y)

KeyboardInterrupt: 

In [None]:

"""
 predict the price of a house based on its location, square footage (sqft),
 number of bathrooms (bath), and number of bedrooms (bhk)
"""

"""
How predict_price builds its input row (example: location='location2'):

1. Find the index of the location:
   loc_index = np.where(X.columns=='location2')[0][0] finds the position of the
   'location2' column in X.columns, say it's 4.

2. Initialize the feature array:
   x = np.zeros(len(X.columns)) creates an array of zeros, e.g., [0, 0, 0, 0, 0, 0, ...].

3. Set the sqft, bath, and bhk values:
   x[0] = 1200, x[1] = 2, x[2] = 3 sets the first three values in x, so x becomes
   [1200, 2, 3, 0, 0, 0, ...].

4. Set the one-hot encoded location:
   x[loc_index] = 1 (here x[4] = 1) sets the value of the corresponding location feature
   (for 'location2') to 1, so x becomes [1200, 2, 3, 0, 1, 0, ...].

5. Make the prediction:
   lr_clf.predict([x]) uses the trained model to predict the price from these features, and
   the predicted price is returned.
"""


import numpy as np
def predict_price(location, sqft, bath, bhk, model=None, columns=None):
    """Predict a price from a location name, square footage, bathrooms and bedrooms.

    A single feature row is built with ``sqft``, ``bath`` and ``bhk`` in positions
    0, 1 and 2, and a 1 in the column whose name equals ``location``. All other
    entries stay 0.
    NOTE(review): this assumes the first three feature columns are sqft, bath and
    bhk in that order -- confirm against the preprocessed dataset.

    Parameters
    ----------
    location : str
        Name of the location column to switch on. If no column has this name the
        row keeps all location entries at 0 instead of raising an error.
    sqft, bath, bhk : numbers
        Values written to the first three feature positions.
    model : optional
        Fitted estimator exposing ``predict``; defaults to the notebook's ``lr_clf``.
    columns : optional
        Feature column names; defaults to ``X.columns``.

    Returns
    -------
    The first (only) element of ``model.predict([x])``.
    """
    # Fall back to the notebook-level objects only when nothing is passed in,
    # so existing calls predict_price(location, sqft, bath, bhk) keep working.
    if columns is None:
        columns = X.columns
    if model is None:
        model = lr_clf

    x = np.zeros(len(columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # np.where returns an empty array for an unknown location; indexing it with
    # [0] unconditionally would raise IndexError, so check before using it.
    matches = np.where(np.asarray(columns, dtype=object) == location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    # predict() expects a 2D input (one row per sample), hence [x];
    # [0] extracts the single prediction.
    return model.predict([x])[0]