In [1]:
import pandas as pd
import numpy as np


df1 = pd.read_csv("C:/Users/praka/my_personal_project/monroe_house_price_prediction/data/bengaluru_house_prices.csv")

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

In [3]:
class DropUnwantedColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.columns_to_drop)

In [4]:
class DropNullValues(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.dropna()

In [5]:
class FeatureEngineeringBHK(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['bhk'] = X['size'].apply(lambda x: int(x.split(' ')[0]))
        return X

In [6]:
class ConvertSqftToNum(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def convert_sqft_to_num(x):
            tokens = x.split('-')
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
            try:
                return float(x)
            except:
                return None
        X['total_sqft'] = X['total_sqft'].apply(convert_sqft_to_num)
        X = X[X['total_sqft'].notnull()]
        return X

In [7]:
class FeatureEngineeringPricePerSqft(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['price_per_sqft'] = X['price'] * 100000 / X['total_sqft']
        return X

In [8]:
class FeatureEngineeringLocation(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['location'] = X['location'].apply(lambda x: x.strip())
        location_stats = X['location'].value_counts(ascending=False)
        location_stats_less_than_10 = location_stats[location_stats <= 10]
        X['location'] = X['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
        return X

In [10]:
class OutlierRemovalPricepersqft(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X[~(X.total_sqft/X.bhk<300)]
        df_out = pd.DataFrame()
        
        for key, subdf in X.groupby('location'):
            m = np.mean(subdf.price_per_sqft)
            st = np.std(subdf.price_per_sqft)
            reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
            df_out = pd.concat([df_out,reduced_df],ignore_index=True)
            
        return df_out

In [11]:
class OutlierRemovalBHK(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        exclude_indices = np.array([])
        for location, location_df in X.groupby('location'):
            bhk_stats = {}
            for bhk, bhk_df in location_df.groupby('bhk'):
                bhk_stats[bhk] = {
                    'mean': np.mean(bhk_df.price_per_sqft),
                    'std': np.std(bhk_df.price_per_sqft),
                    'count': bhk_df.shape[0]
                }
            for bhk, bhk_df in location_df.groupby('bhk'):
                stats = bhk_stats.get(bhk-1)
                if stats and stats['count']>5:
                    exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
                    
        return X.drop(exclude_indices,axis='index')

In [12]:
class OutlierRemovalBathroom(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X[X.bath<X.bhk+2]
        X = X.drop(['size','price_per_sqft'],axis='columns')
        
        return X

In [13]:
class LocationOneHotEncoding(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        location_dummies = pd.get_dummies(X['location'])
        X = pd.concat([X, location_dummies.drop('other', axis = 'columns')], axis='columns')
        X = X.drop('location',axis='columns')
        
        return X

In [15]:
from sklearn.pipeline import Pipeline

def get_data_transform_pipeline():
    pipeline = Pipeline(steps=[
        ('drop_unwanted_columns', DropUnwantedColumns(columns_to_drop=['area_type', 'society', 'balcony', 'availability'])),
        ('drop_null_values', DropNullValues()),
        ('feature_engineering_bhk', FeatureEngineeringBHK()),
        ('convert_sqft_to_num', ConvertSqftToNum()),
        ('feature_engineering_price_per_sqft', FeatureEngineeringPricePerSqft()),
        ('feature_engineering_location', FeatureEngineeringLocation()),
        ('outlier_removal_pricepersqft', OutlierRemovalPricepersqft()),
        ('outlier_removal_bhk', OutlierRemovalBHK()),
        ('outlier_removal_bathroom', OutlierRemovalBathroom()),
        ('location_one_hot_encoding', LocationOneHotEncoding())
    ])
    return pipeline

In [16]:
pipeline = get_data_transform_pipeline()

In [17]:
transformed_df = pipeline.fit_transform(df1)

In [18]:
transformed_df

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1630.0,3.0,194.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1875.0,2.0,235.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1200.0,2.0,130.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1235.0,2.0,148.0,2,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10233,1200.0,2.0,70.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10234,1800.0,1.0,200.0,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10237,1353.0,2.0,110.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10238,812.0,1.0,26.0,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [1]:
import os
import sys
from src.logger.logging import logging
from src.exceptions.exception import customexception
import pandas as pd

from src.components.data_ingestion import DataIngestion
from src.components.data_transformation import DataTransformation


data_ingestion = DataIngestion()

raw_data_path = data_ingestion.initiate_data_ingestion()

data_transformation = DataTransformation()

train_df, test_df = data_transformation.initialize_data_transformation(raw_data_path)

In [2]:
train_df

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,price
0,3500.0,5.0,4,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,450.0
1,1900.0,3.0,3,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,160.0
2,1800.0,3.0,3,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,90.0
3,1210.0,2.0,2,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,69.0
4,1510.0,2.0,2,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,1139.0,2.0,2,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,60.0
5787,1495.0,2.0,2,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,110.0
5788,950.0,2.0,2,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,40.0
5789,6000.0,8.0,8,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,215.0


In [4]:
import numpy as np
train_array = np.array(train_df)

In [8]:
train_array = np.array(train_df)
test_array = np.array(test_df)


X_train, y_train, X_test, y_test = (
    train_array[:,:-1],
    train_array[:,-1],
    test_array[:,:-1],
    test_array[:,-1]
)

In [10]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()

In [11]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# cross_val_score(LinearRegression(), X, y, cv=cv)

In [26]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['absolute_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scoring = ['neg_mean_squared_error', 'neg_median_absolute_error', 'r2']
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'],scoring = scoring, refit='r2', cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    # return pd.DataFrame(scores,columns=['model','best_score','best_params'])
    return scores

report = find_best_model_using_gridsearchcv(np.concatenate((X_train,X_test), axis =0), np.concatenate((y_train,y_test), axis =0))


In [27]:
report

[{'model': 'linear_regression',
  'best_score': np.float64(0.8450476036786242),
  'best_params': {'fit_intercept': False}},
 {'model': 'lasso',
  'best_score': np.float64(0.7068857757537745),
  'best_params': {'alpha': 1, 'selection': 'random'}},
 {'model': 'decision_tree',
  'best_score': np.float64(0.6850479358428109),
  'best_params': {'criterion': 'friedman_mse', 'splitter': 'random'}}]

In [28]:
score = {result['model']: result['best_score'] for result in report}

In [29]:
score

{'linear_regression': np.float64(0.8450476036786242),
 'lasso': np.float64(0.7068857757537745),
 'decision_tree': np.float64(0.6850479358428109)}

In [30]:
best_model_score = max(score.values())
best_model_name = [key for key, value in score.items() if value == best_model_score][0]

In [31]:
best_model_score

np.float64(0.8450476036786242)

In [32]:
best_model_name

'linear_regression'

In [33]:
best_params = [result['best_params']  for result in report if result['model']==best_model_name]

In [34]:
best_params

[{'fit_intercept': False}]

In [36]:
best_model = models[best_model_name]['params']
            
best_model.set_params(**best_params)
best_model.fit(np.concatenate((X_train, X_test), axis=0), np.concatenate((y_train, y_test), axis=0))

AttributeError: 'str' object has no attribute 'set_params'

In [25]:
report.to_dict()

{'model': {0: 'linear_regression', 1: 'lasso', 2: 'decision_tree'},
 'best_score': {0: 0.8450476036786242,
  1: 0.7068699256640066,
  2: 0.7180054780521311},
 'best_params': {0: {'fit_intercept': False},
  1: {'alpha': 1, 'selection': 'cyclic'},
  2: {'criterion': 'friedman_mse', 'splitter': 'random'}}}