# Import Necessary libraries

In [188]:
import json
import pickle
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error


%matplotlib inline

In [189]:
# Load 
df = pd.read_csv(r'../data/v4_Feature_Engineering.csv')
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Phase JP Nagar,1875.0,3.0,167.0,3
1,1st Phase JP Nagar,1500.0,5.0,85.0,5
2,1st Phase JP Nagar,2065.0,4.0,210.0,3
3,1st Phase JP Nagar,2059.0,3.0,225.0,3
4,1st Phase JP Nagar,1394.0,2.0,100.0,2


<h2 style='color:white'>Use One Hot Encoding For Location</h2>

In [190]:
dummies = pd.get_dummies(df.location)
dummies.head().T

Unnamed: 0,0,1,2,3,4
1st Phase JP Nagar,True,True,True,True,True
5th Phase JP Nagar,False,False,False,False,False
6th Phase JP Nagar,False,False,False,False,False
7th Phase JP Nagar,False,False,False,False,False
8th Phase JP Nagar,False,False,False,False,False
...,...,...,...,...,...
Yelahanka,False,False,False,False,False
Yelahanka New Town,False,False,False,False,False
Yelenahalli,False,False,False,False,False
Yeshwanthpur,False,False,False,False,False


In [191]:
df = pd.concat([df,dummies.drop('other',axis='columns')],axis='columns')
df.head().T

Unnamed: 0,0,1,2,3,4
location,1st Phase JP Nagar,1st Phase JP Nagar,1st Phase JP Nagar,1st Phase JP Nagar,1st Phase JP Nagar
total_sqft,1875.0,1500.0,2065.0,2059.0,1394.0
bath,3.0,5.0,4.0,3.0,2.0
price,167.0,85.0,210.0,225.0,100.0
bhk,3,5,3,3,2
...,...,...,...,...,...
Yelachenahalli,False,False,False,False,False
Yelahanka,False,False,False,False,False
Yelahanka New Town,False,False,False,False,False
Yelenahalli,False,False,False,False,False


In [192]:
df = df.drop('location',axis='columns')
df.head().T

Unnamed: 0,0,1,2,3,4
total_sqft,1875.0,1500.0,2065.0,2059.0,1394.0
bath,3.0,5.0,4.0,3.0,2.0
price,167.0,85.0,210.0,225.0,100.0
bhk,3,5,3,3,2
1st Phase JP Nagar,True,True,True,True,True
...,...,...,...,...,...
Yelachenahalli,False,False,False,False,False
Yelahanka,False,False,False,False,False
Yelahanka New Town,False,False,False,False,False
Yelenahalli,False,False,False,False,False


In [193]:
df.shape

(6906, 218)

<h2 style='color:white'>Build a Model Now...</h2>

In [194]:
# Feature Selection
X = df.drop(['price'], axis='columns')
y = df.price

In [195]:
X.head().T

Unnamed: 0,0,1,2,3,4
total_sqft,1875.0,1500.0,2065.0,2059.0,1394.0
bath,3.0,5.0,4.0,3.0,2.0
bhk,3,5,3,3,2
1st Phase JP Nagar,True,True,True,True,True
5th Phase JP Nagar,False,False,False,False,False
...,...,...,...,...,...
Yelachenahalli,False,False,False,False,False
Yelahanka,False,False,False,False,False
Yelahanka New Town,False,False,False,False,False
Yelenahalli,False,False,False,False,False


In [196]:
X.shape

(6906, 217)

In [197]:
y.head()

0    167.0
1     85.0
2    210.0
3    225.0
4    100.0
Name: price, dtype: float64

In [198]:
y.shape

(6906,)

In [199]:
len(y)

6906

# Splitting The Data 

In [200]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [201]:
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [202]:
y_pred = model.predict(X_test)
print(f"r2_score: {r2_score(y_test, y_pred)}")
print(f"MAE(L1 Loss): {mean_absolute_error(y_test, y_pred)}")
# print(f"MSE(L2 Loss): {mean_squared_error(y_test, y_pred, squared=False)}")
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred)}")

r2_score: 0.8585921954851982
MAE(L1 Loss): 17.98659021645416
MAPE: 0.2171776045453777


In [203]:
model.score(X_test, y_test)

0.8585921954851982

<h3 style='color:white'>Use K Fold cross validation to measure accuracy of our LinearRegression model</h3>

In [204]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
scores = cross_val_score(model, X, y, cv=cv)
scores

array([0.85926478, 0.85602052, 0.86644893, 0.80014233, 0.87821385])

We can see that in 5 iterations we get a score above 80% most of the time. This is pretty good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose

<h2 style='color:white'>Find best model using GridSearchCV</h2>

In [205]:
def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'n_jobs': [None]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.852018,{'n_jobs': None}
1,lasso,0.713159,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.761355,"{'criterion': 'friedman_mse', 'splitter': 'best'}"


**Based on above results we can say that LinearRegression gives the best score. Hence we will use that.**

<h2 style='color:white'>Test the model for few properties</h2>

In [206]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return model.predict([x])[0]

In [207]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

np.float64(86.50621332857222)

In [208]:
predict_price('1st Phase JP Nagar',1000, 3, 3)

np.float64(88.4973313297941)

In [209]:
predict_price('Indira Nagar',1000, 2, 2)

np.float64(183.46816681242478)

In [210]:
predict_price('Indira Nagar',1000, 3, 3)

np.float64(185.45928481364666)

<h3 style='color:white'>Export the tested model to a pickle file</h3>

In [211]:
with open('../artifacts/model.pkl','wb') as f:
    pickle.dump(model,f)

<h3 style='color:white'>Export location and column information to a file that will be useful later on in our prediction application</h3>

In [212]:
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("../artifacts/columns.json","w") as f:
    f.write(json.dumps(columns))

### This is for `src/components/e_model_trainer.py`

In [213]:
import os
import sys
import pandas as pd
import pickle
import json
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from src.exception import CustomException
from src.logger import configure_logger

logger = configure_logger("ModelTrainer")

class ModelTrainer:
    def find_best_model_using_gridsearchcv(self, X, y):
        # This mirrors the 'Best Model' logic from your notebook
        algos = {
            'linear_regression': {
                'model': LinearRegression(),
                'params': {'copy_X': [True, False]}
            },
            'lasso': {
                'model': Lasso(),
                'params': {'alpha': [1, 2], 'selection': ['random', 'cyclic']}
            },
            'decision_tree': {
                'model': DecisionTreeRegressor(),
                'params': {
                    'criterion': ['poisson', 'friedman_mse'],
                    'splitter': ['best', 'random']
                }
            }
        }
        scores = []
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        for algo_name, config in algos.items():
            gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
            gs.fit(X, y)
            scores.append({
                'model': algo_name,
                'best_score': gs.best_score_,
                'best_params': gs.best_params_
            })
        return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

    def initiate_model_trainer(self, train_df):
        try:
            logger.info("Starting Model Training and Hyperparameter Tuning...")
            
            # 1. One-Hot Encoding (Location)
            dummies = pd.get_dummies(train_df.location)
            df = pd.concat([train_df, dummies.drop('other', axis='columns')], axis='columns')
            df = df.drop('location', axis='columns')

            X = df.drop(['price'], axis='columns')
            y = df.price

            # 2. Find Best Model
            model_report = self.find_best_model_using_gridsearchcv(X, y)
            logger.info(f"Model Report Generated: \n{model_report}")

            # 3. Save the best model (Example: Linear Regression)
            # You can add logic here to automatically pick the winner from the report
            best_model = LinearRegression()
            best_model.fit(X, y)

            # 4. Export Artifacts
            os.makedirs('artifacts', exist_ok=True)
            with open("artifacts/model.pkl", "wb") as f:
                pickle.dump(best_model, f)
            
            columns = {'data_columns': [col.lower() for col in X.columns]}
            with open("artifacts/columns.json", "w") as f:
                json.dump(columns, f)

            logger.info("Model and Columns exported successfully to artifacts/")
            
        except Exception as e:
            raise CustomException(e, sys)