In [38]:
import pandas as pd
import numpy as np


df1 = pd.read_csv("C:/Users/praka/my_personal_project/monroe_house_price_prediction/data/bengaluru_house_prices.csv")

In [39]:
# train_df = df1.iloc[:5791,:]
# test_df = df1.iloc[5791:,:]

In [40]:
# test_df

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

In [42]:
class DropUnwantedColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.columns_to_drop)

In [43]:
class DropNullValues(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.dropna()

In [44]:
class FeatureEngineeringBHK(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['bhk'] = X['size'].apply(lambda x: int(x.split(' ')[0]))
        return X

In [45]:
class ConvertSqftToNum(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def convert_sqft_to_num(x):
            tokens = x.split('-')
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
            try:
                return float(x)
            except:
                return None
        X['total_sqft'] = X['total_sqft'].apply(convert_sqft_to_num)
        X = X[X['total_sqft'].notnull()]
        return X

In [46]:
class FeatureEngineeringPricePerSqft(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['price_per_sqft'] = X['price'] * 100000 / X['total_sqft']
        return X

In [47]:
class FeatureEngineeringLocation(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['location'] = X['location'].apply(lambda x: x.strip())
        location_stats = X['location'].value_counts(ascending=False)
        location_stats_less_than_10 = location_stats[location_stats <= 10]
        X['location'] = X['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
        return X

In [48]:
class OutlierRemovalPricepersqft(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X[~(X.total_sqft/X.bhk<300)]
        df_out = pd.DataFrame()
        
        for key, subdf in X.groupby('location'):
            m = np.mean(subdf.price_per_sqft)
            st = np.std(subdf.price_per_sqft)
            reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
            df_out = pd.concat([df_out,reduced_df],ignore_index=True)
            
        return df_out

In [49]:
class OutlierRemovalBHK(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        exclude_indices = np.array([])
        for location, location_df in X.groupby('location'):
            bhk_stats = {}
            for bhk, bhk_df in location_df.groupby('bhk'):
                bhk_stats[bhk] = {
                    'mean': np.mean(bhk_df.price_per_sqft),
                    'std': np.std(bhk_df.price_per_sqft),
                    'count': bhk_df.shape[0]
                }
            for bhk, bhk_df in location_df.groupby('bhk'):
                stats = bhk_stats.get(bhk-1)
                if stats and stats['count']>5:
                    exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
                    
        return X.drop(exclude_indices,axis='index')

In [50]:
class OutlierRemovalBathroom(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X[X.bath<X.bhk+2]
        X = X.drop(['size','price_per_sqft'],axis='columns')
        
        return X

In [61]:
# Version 1
# class LocationOneHotEncoding(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         # location = [ "1st block jayanagar", "1st phase jp nagar", "2nd phase judicial layout", "2nd stage nagarbhavi", "5th block hbr layout", "5th phase jp nagar", "6th phase jp nagar", "7th phase jp nagar", "8th phase jp nagar", "9th phase jp nagar", "aecs layout", "abbigere", "akshaya nagar", "ambalipura", "ambedkar nagar", "amruthahalli", "anandapura", "ananth nagar", "anekal", "anjanapura", "ardendale", "arekere", "attibele", "beml layout", "btm 2nd stage", "btm layout", "babusapalaya", "badavala nagar", "balagere", "banashankari", "banashankari stage ii", "banashankari stage iii", "banashankari stage v", "banashankari stage vi", "banaswadi", "banjara layout", "bannerghatta", "bannerghatta road", "basavangudi", "basaveshwara nagar", "battarahalli", "begur", "begur road", "bellandur", "benson town", "bharathi nagar", "bhoganhalli", "billekahalli", "binny pete", "bisuvanahalli", "bommanahalli", "bommasandra", "bommasandra industrial area", "bommenahalli", "brookefield", "budigere", "cv raman nagar", "chamrajpet", "chandapura", "channasandra", "chikka tirupathi", "chikkabanavar", "chikkalasandra", "choodasandra", "cooke town", "cox town", "cunningham road", "dasanapura", "dasarahalli", "devanahalli", "devarachikkanahalli", "dodda nekkundi", "doddaballapur", "doddakallasandra", "doddathoguru", "domlur", "dommasandra", "epip zone", "electronic city", "electronic city phase ii", "electronics city phase 1", "frazer town", "gm palaya", "garudachar palya", "giri nagar", "gollarapalya hosahalli", "gottigere", "green glen layout", "gubbalala", "gunjur", "hal 2nd stage", "hbr layout", "hrbr layout", "hsr layout", "haralur road", "harlur", "hebbal", "hebbal kempapura", "hegde nagar", "hennur", "hennur road", "hoodi", "horamavu agara", "horamavu banaswadi", "hormavu", "hosa road", "hosakerehalli", "hoskote", "hosur road", "hulimavu", "isro layout", "itpl", "iblur village", "indira nagar", "jp nagar", "jakkur", "jalahalli", "jalahalli east", "jigani", "judicial layout", "kr puram", "kadubeesanahalli", "kadugodi", "kaggadasapura", "kaggalipura", "kaikondrahalli", "kalena agrahara", "kalyan nagar", "kambipura", "kammanahalli", "kammasandra", "kanakapura", "kanakpura road", "kannamangala", "karuna nagar", "kasavanhalli", "kasturi nagar", "kathriguppe", "kaval byrasandra", "kenchenahalli", "kengeri", "kengeri satellite town", "kereguddadahalli", "kodichikkanahalli", "kodigehaali", "kodigehalli", "kodihalli", "kogilu", "konanakunte", "koramangala", "kothannur", "kothanur", "kudlu", "kudlu gate", "kumaraswami layout", "kundalahalli", "lb shastri nagar", "laggere", "lakshminarayana pura", "lingadheeranahalli", "magadi road", "mahadevpura", "mahalakshmi layout", "mallasandra", "malleshpalya", "malleshwaram", "marathahalli", "margondanahalli", "marsur", "mico layout", "munnekollal", "murugeshpalya", "mysore road", "ngr layout", "nri layout", "nagarbhavi", "nagasandra", "nagavara", "nagavarapalya", "narayanapura", "neeladri nagar", "nehru nagar", "ombr layout", "old airport road", "old madras road", "padmanabhanagar", "pai layout", "panathur", "parappana agrahara", "pattandur agrahara", "poorna pragna layout", "prithvi layout", "r.t. nagar", "rachenahalli", "raja rajeshwari nagar", "rajaji nagar", "rajiv nagar", "ramagondanahalli", "ramamurthy nagar", "rayasandra", "sahakara nagar", "sanjay nagar", "sarakki nagar", "sarjapur", "sarjapur  road", "sarjapura - attibele road", "sector 2 hsr layout", "sector 7 hsr layout", "seegehalli", "shampura", "shivaji nagar", "singasandra", "somasundara palya", "sompura", "sonnenahalli", "subramanyapura", "sultan palaya", "tc palaya", "talaghattapura", "thanisandra", "thigalarapalya", "thubarahalli", "tindlu", "tumkur road", "ulsoor", "uttarahalli", "varthur", "varthur road", "vasanthapura", "vidyaranyapura", "vijayanagar", "vishveshwarya layout", "vishwapriya layout", "vittasandra", "whitefield", "yelachenahalli", "yelahanka", "yelahanka new town", "yelenahalli", "yeshwanthpur","other"]
#         location_dummies = pd.get_dummies(X['location'])
#         X = pd.concat([X, location_dummies.drop('other', axis = 'columns')], axis='columns')
#         X = X.drop('location',axis='columns')
        
#         return X

In [60]:
# Version 2
from sklearn.preprocessing import OrdinalEncoder

class LocationOneHotEncoding(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # location = [ "1st block jayanagar", "1st phase jp nagar", "2nd phase judicial layout", "2nd stage nagarbhavi", "5th block hbr layout", "5th phase jp nagar", "6th phase jp nagar", "7th phase jp nagar", "8th phase jp nagar", "9th phase jp nagar", "aecs layout", "abbigere", "akshaya nagar", "ambalipura", "ambedkar nagar", "amruthahalli", "anandapura", "ananth nagar", "anekal", "anjanapura", "ardendale", "arekere", "attibele", "beml layout", "btm 2nd stage", "btm layout", "babusapalaya", "badavala nagar", "balagere", "banashankari", "banashankari stage ii", "banashankari stage iii", "banashankari stage v", "banashankari stage vi", "banaswadi", "banjara layout", "bannerghatta", "bannerghatta road", "basavangudi", "basaveshwara nagar", "battarahalli", "begur", "begur road", "bellandur", "benson town", "bharathi nagar", "bhoganhalli", "billekahalli", "binny pete", "bisuvanahalli", "bommanahalli", "bommasandra", "bommasandra industrial area", "bommenahalli", "brookefield", "budigere", "cv raman nagar", "chamrajpet", "chandapura", "channasandra", "chikka tirupathi", "chikkabanavar", "chikkalasandra", "choodasandra", "cooke town", "cox town", "cunningham road", "dasanapura", "dasarahalli", "devanahalli", "devarachikkanahalli", "dodda nekkundi", "doddaballapur", "doddakallasandra", "doddathoguru", "domlur", "dommasandra", "epip zone", "electronic city", "electronic city phase ii", "electronics city phase 1", "frazer town", "gm palaya", "garudachar palya", "giri nagar", "gollarapalya hosahalli", "gottigere", "green glen layout", "gubbalala", "gunjur", "hal 2nd stage", "hbr layout", "hrbr layout", "hsr layout", "haralur road", "harlur", "hebbal", "hebbal kempapura", "hegde nagar", "hennur", "hennur road", "hoodi", "horamavu agara", "horamavu banaswadi", "hormavu", "hosa road", "hosakerehalli", "hoskote", "hosur road", "hulimavu", "isro layout", "itpl", "iblur village", "indira nagar", "jp nagar", "jakkur", "jalahalli", "jalahalli east", "jigani", "judicial layout", "kr puram", "kadubeesanahalli", "kadugodi", "kaggadasapura", "kaggalipura", "kaikondrahalli", "kalena agrahara", "kalyan nagar", "kambipura", "kammanahalli", "kammasandra", "kanakapura", "kanakpura road", "kannamangala", "karuna nagar", "kasavanhalli", "kasturi nagar", "kathriguppe", "kaval byrasandra", "kenchenahalli", "kengeri", "kengeri satellite town", "kereguddadahalli", "kodichikkanahalli", "kodigehaali", "kodigehalli", "kodihalli", "kogilu", "konanakunte", "koramangala", "kothannur", "kothanur", "kudlu", "kudlu gate", "kumaraswami layout", "kundalahalli", "lb shastri nagar", "laggere", "lakshminarayana pura", "lingadheeranahalli", "magadi road", "mahadevpura", "mahalakshmi layout", "mallasandra", "malleshpalya", "malleshwaram", "marathahalli", "margondanahalli", "marsur", "mico layout", "munnekollal", "murugeshpalya", "mysore road", "ngr layout", "nri layout", "nagarbhavi", "nagasandra", "nagavara", "nagavarapalya", "narayanapura", "neeladri nagar", "nehru nagar", "ombr layout", "old airport road", "old madras road", "padmanabhanagar", "pai layout", "panathur", "parappana agrahara", "pattandur agrahara", "poorna pragna layout", "prithvi layout", "r.t. nagar", "rachenahalli", "raja rajeshwari nagar", "rajaji nagar", "rajiv nagar", "ramagondanahalli", "ramamurthy nagar", "rayasandra", "sahakara nagar", "sanjay nagar", "sarakki nagar", "sarjapur", "sarjapur  road", "sarjapura - attibele road", "sector 2 hsr layout", "sector 7 hsr layout", "seegehalli", "shampura", "shivaji nagar", "singasandra", "somasundara palya", "sompura", "sonnenahalli", "subramanyapura", "sultan palaya", "tc palaya", "talaghattapura", "thanisandra", "thigalarapalya", "thubarahalli", "tindlu", "tumkur road", "ulsoor", "uttarahalli", "varthur", "varthur road", "vasanthapura", "vidyaranyapura", "vijayanagar", "vishveshwarya layout", "vishwapriya layout", "vittasandra", "whitefield", "yelachenahalli", "yelahanka", "yelahanka new town", "yelenahalli", "yeshwanthpur","other"]
        location_dummies = pd.get_dummies(X['location'])
        X = pd.concat([X, location_dummies.drop('other', axis = 'columns')], axis='columns')
        X = X.drop('location',axis='columns')
        
        return X

In [68]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


def location_transform_by_Ordinal_encoder():
    column = ['location']
    location = [ "1st block jayanagar", "1st phase jp nagar", "2nd phase judicial layout", "2nd stage nagarbhavi", "5th block hbr layout", "5th phase jp nagar", "6th phase jp nagar", "7th phase jp nagar", "8th phase jp nagar", "9th phase jp nagar", "aecs layout", "abbigere", "akshaya nagar", "ambalipura", "ambedkar nagar", "amruthahalli", "anandapura", "ananth nagar", "anekal", "anjanapura", "ardendale", "arekere", "attibele", "beml layout", "btm 2nd stage", "btm layout", "babusapalaya", "badavala nagar", "balagere", "banashankari", "banashankari stage ii", "banashankari stage iii", "banashankari stage v", "banashankari stage vi", "banaswadi", "banjara layout", "bannerghatta", "bannerghatta road", "basavangudi", "basaveshwara nagar", "battarahalli", "begur", "begur road", "bellandur", "benson town", "bharathi nagar", "bhoganhalli", "billekahalli", "binny pete", "bisuvanahalli", "bommanahalli", "bommasandra", "bommasandra industrial area", "bommenahalli", "brookefield", "budigere", "cv raman nagar", "chamrajpet", "chandapura", "channasandra", "chikka tirupathi", "chikkabanavar", "chikkalasandra", "choodasandra", "cooke town", "cox town", "cunningham road", "dasanapura", "dasarahalli", "devanahalli", "devarachikkanahalli", "dodda nekkundi", "doddaballapur", "doddakallasandra", "doddathoguru", "domlur", "dommasandra", "epip zone", "electronic city", "electronic city phase ii", "electronics city phase 1", "frazer town", "gm palaya", "garudachar palya", "giri nagar", "gollarapalya hosahalli", "gottigere", "green glen layout", "gubbalala", "gunjur", "hal 2nd stage", "hbr layout", "hrbr layout", "hsr layout", "haralur road", "harlur", "hebbal", "hebbal kempapura", "hegde nagar", "hennur", "hennur road", "hoodi", "horamavu agara", "horamavu banaswadi", "hormavu", "hosa road", "hosakerehalli", "hoskote", "hosur road", "hulimavu", "isro layout", "itpl", "iblur village", "indira nagar", "jp nagar", "jakkur", "jalahalli", "jalahalli east", "jigani", "judicial layout", "kr puram", "kadubeesanahalli", "kadugodi", "kaggadasapura", "kaggalipura", "kaikondrahalli", "kalena agrahara", "kalyan nagar", "kambipura", "kammanahalli", "kammasandra", "kanakapura", "kanakpura road", "kannamangala", "karuna nagar", "kasavanhalli", "kasturi nagar", "kathriguppe", "kaval byrasandra", "kenchenahalli", "kengeri", "kengeri satellite town", "kereguddadahalli", "kodichikkanahalli", "kodigehaali", "kodigehalli", "kodihalli", "kogilu", "konanakunte", "koramangala", "kothannur", "kothanur", "kudlu", "kudlu gate", "kumaraswami layout", "kundalahalli", "lb shastri nagar", "laggere", "lakshminarayana pura", "lingadheeranahalli", "magadi road", "mahadevpura", "mahalakshmi layout", "mallasandra", "malleshpalya", "malleshwaram", "marathahalli", "margondanahalli", "marsur", "mico layout", "munnekollal", "murugeshpalya", "mysore road", "ngr layout", "nri layout", "nagarbhavi", "nagasandra", "nagavara", "nagavarapalya", "narayanapura", "neeladri nagar", "nehru nagar", "ombr layout", "old airport road", "old madras road", "padmanabhanagar", "pai layout", "panathur", "parappana agrahara", "pattandur agrahara", "poorna pragna layout", "prithvi layout", "r.t. nagar", "rachenahalli", "raja rajeshwari nagar", "rajaji nagar", "rajiv nagar", "ramagondanahalli", "ramamurthy nagar", "rayasandra", "sahakara nagar", "sanjay nagar", "sarakki nagar", "sarjapur", "sarjapur  road", "sarjapura - attibele road", "sector 2 hsr layout", "sector 7 hsr layout", "seegehalli", "shampura", "shivaji nagar", "singasandra", "somasundara palya", "sompura", "sonnenahalli", "subramanyapura", "sultan palaya", "tc palaya", "talaghattapura", "thanisandra", "thigalarapalya", "thubarahalli", "tindlu", "tumkur road", "ulsoor", "uttarahalli", "varthur", "varthur road", "vasanthapura", "vidyaranyapura", "vijayanagar", "vishveshwarya layout", "vishwapriya layout", "vittasandra", "whitefield", "yelachenahalli", "yelahanka", "yelahanka new town", "yelenahalli", "yeshwanthpur","other"]
    pipeline = Pipeline(steps=[
        ('location_encoder',OrdinalEncoder(categories=[location]))
    ])
    
    preprocessor = ColumnTransformer([
        ('location_pipeline', pipeline, column)
    ])
    
    # transformed_df = preprocessor.fit_transform(data)
    
    return preprocessor

In [65]:
from sklearn.pipeline import Pipeline

def get_data_transform_pipeline():
    pipeline = Pipeline(steps=[
        ('drop_unwanted_columns', DropUnwantedColumns(columns_to_drop=['area_type', 'society', 'balcony', 'availability'])),
        ('drop_null_values', DropNullValues()),
        ('feature_engineering_bhk', FeatureEngineeringBHK()),
        ('convert_sqft_to_num', ConvertSqftToNum()),
        ('feature_engineering_price_per_sqft', FeatureEngineeringPricePerSqft()),
        ('feature_engineering_location', FeatureEngineeringLocation()),
        ('outlier_removal_pricepersqft', OutlierRemovalPricepersqft()),
        ('outlier_removal_bhk', OutlierRemovalBHK()),
        ('outlier_removal_bathroom', OutlierRemovalBathroom())
        # ('location_one_hot_encoding', LocationOneHotEncoding())
    ])
    return pipeline

def get_location_transform_pipeline():
    
    location_pipeline = Pipeline(steps = [
        ('location_one_hot_encoding', LocationOneHotEncoding())
    ])
    
    return location_pipeline

In [53]:
from sklearn.preprocessing import OrdinalEncoder

def get_location_transformed_df():
    location_pipeline = Pipeline(steps=[
        ('location_one_hot_encoding', LocationOneHotEncoding())
    ])
    
    return location_pipeline


def do_location_transform(data):
    loc_pipeline = get_data_transform_pipeline()
    
    location_df = loc_pipeline.fit_transform(data)
    
    return location_df

In [None]:
def 

In [69]:
pipeline = get_data_transform_pipeline()
location_pipeline = location_transform_by_Ordinal_encoder()

In [70]:
transformed_df = pipeline.fit_transform(df1)
location_transformed_df = location_pipeline.fit_transform(transformed_df)

ValueError: Found unknown categories ['Attibele', 'Kodihalli', 'Begur Road', 'Jalahalli East', 'Somasundara Palya', 'Billekahalli', 'Ananth Nagar', 'Margondanahalli', 'Thigalarapalya', 'Dodda Nekkundi', 'Hulimavu', 'Lakshminarayana Pura', 'Indira Nagar', 'Basaveshwara Nagar', 'Judicial Layout', 'Sarjapur  Road', 'Chandapura', 'Hoskote', 'Anandapura', 'Basavangudi', 'Kodichikkanahalli', 'EPIP Zone', 'Ulsoor', 'Rajaji Nagar', 'Kasavanhalli', 'Kanakpura Road', 'Nehru Nagar', 'Lingadheeranahalli', 'Kalena Agrahara', 'R.T. Nagar', 'Chikka Tirupathi', 'Garudachar Palya', 'Cooke Town', 'Sarakki Nagar', 'Yelahanka New Town', 'KR Puram', 'Kaggalipura', '7th Phase JP Nagar', 'Ramagondanahalli', 'Kanakapura', 'NRI Layout', 'NGR Layout', 'Hegde Nagar', 'Rajiv Nagar', 'BTM 2nd Stage', 'Magadi Road', 'Kammanahalli', 'Hennur', 'Panathur', 'Sompura', 'Jalahalli', 'Kudlu Gate', 'Balagere', 'Pattandur Agrahara', 'Brookefield', 'Banashankari Stage VI', 'Dasarahalli', 'HSR Layout', 'Kundalahalli', 'Malleshpalya', 'Singasandra', 'Channasandra', 'Kadugodi', '5th Phase JP Nagar', 'BTM Layout', 'Mahalakshmi Layout', 'Thubarahalli', 'ITPL', 'Subramanyapura', 'Old Madras Road', 'Anjanapura', 'Banjara Layout', 'Kasturi Nagar', 'Sector 7 HSR Layout', 'Thanisandra', 'Mico Layout', 'Laggere', 'JP Nagar', 'Battarahalli', 'Nagarbhavi', 'HAL 2nd Stage', 'AECS Layout', 'Bommenahalli', 'Devanahalli', 'Bharathi Nagar', 'Kalyan nagar', 'Kathriguppe', 'Kereguddadahalli', 'Sultan Palaya', 'OMBR Layout', 'Budigere', 'Bommasandra', 'Hebbal Kempapura', 'Kogilu', 'Yelachenahalli', 'Kudlu', 'Vasanthapura', 'Munnekollal', 'Bhoganhalli', 'Ambedkar Nagar', 'Ardendale', 'Akshaya Nagar', 'Chikkabanavar', 'Banashankari Stage II', 'Horamavu Agara', 'Doddaballapur', 'Babusapalaya', 'Green Glen Layout', 'ISRO Layout', 'Padmanabhanagar', 'Marsur', 'Sector 2 HSR Layout', 'Narayanapura', 'Iblur Village', 'Koramangala', 'Kothannur', 'Vittasandra', 'Kammasandra', 'Rayasandra', 'Yelenahalli', 'Hormavu', 'Hebbal', 'Karuna Nagar', 'Devarachikkanahalli', 'Malleshwaram', 'Horamavu Banaswadi', 'Vijayanagar', 'Poorna Pragna Layout', '8th Phase JP Nagar', 'CV Raman Nagar', 'Mysore Road', 'Nagavarapalya', 'Konanakunte', 'Electronics City Phase 1', 'Binny Pete', 'Varthur Road', 'Old Airport Road', 'Bannerghatta Road', 'Yeshwanthpur', 'Gunjur', 'Cunningham Road', 'Domlur', 'Banashankari', 'BEML Layout', 'Chikkalasandra', 'Kadubeesanahalli', 'Abbigere', '5th Block Hbr Layout', 'Ramamurthy Nagar', 'Pai Layout', 'Chamrajpet', 'Dasanapura', 'Nagasandra', 'Begur', 'Banashankari Stage III', 'Amruthahalli', 'Bannerghatta', 'Bommanahalli', 'Banaswadi', 'Marathahalli', 'Dommasandra', 'Seegehalli', '6th Phase JP Nagar', 'Shivaji Nagar', 'HRBR Layout', 'Sahakara Nagar', 'Tumkur Road', 'LB Shastri Nagar', 'Kenchenahalli', 'Jigani', 'Vidyaranyapura', 'Murugeshpalya', 'Cox Town', 'Hosur Road', 'Kengeri Satellite Town', 'Tindlu', 'Rachenahalli', 'Kodigehalli', 'Gubbalala', 'Doddathoguru', 'Banashankari Stage V', 'Talaghattapura', 'Electronic City Phase II', 'Anekal', 'Haralur Road', 'Kumaraswami Layout', 'HBR Layout', 'Sanjay nagar', 'Benson Town', 'Badavala Nagar', 'Sarjapura - Attibele Road', 'Sonnenahalli', 'Choodasandra', 'Vishveshwarya Layout', 'Whitefield', 'Arekere', 'Kodigehaali', 'Kothanur', 'Varthur', 'GM Palaya', 'Hosa Road', 'Kaikondrahalli', 'Bommasandra Industrial Area', 'Parappana Agrahara', '2nd Stage Nagarbhavi', 'Yelahanka', 'Frazer Town', 'Hennur Road', 'Sarjapur', 'Doddakallasandra', 'Kambipura', '1st Block Jayanagar', 'Harlur', '2nd Phase Judicial Layout', 'Bellandur', 'Kengeri', 'TC Palaya', 'Giri Nagar', 'Nagavara', 'Gollarapalya Hosahalli', 'Mallasandra', 'Uttarahalli', '9th Phase JP Nagar', '1st Phase JP Nagar', 'Kaggadasapura', 'Prithvi Layout', 'Neeladri Nagar', 'Hosakerehalli', 'Kaval Byrasandra', 'Shampura', 'Kannamangala', 'Mahadevpura', 'Jakkur', 'Bisuvanahalli', 'Gottigere', 'Ambalipura', 'Electronic City', 'Raja Rajeshwari Nagar', 'Hoodi', 'Vishwapriya Layout'] in column 0 during fit

In [59]:
location_transformed_df

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1630.0,3.0,194.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1875.0,2.0,235.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1200.0,2.0,130.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1235.0,2.0,148.0,2,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10233,1200.0,2.0,70.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10234,1800.0,1.0,200.0,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10237,1353.0,2.0,110.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10238,812.0,1.0,26.0,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [126]:
transformed_df

Unnamed: 0,total_sqft,bath,price,bhk,1st block jayanagar,1st phase jp nagar,2nd phase judicial layout,2nd stage nagarbhavi,5th block hbr layout,5th phase jp nagar,...,vijayanagar,vishveshwarya layout,vishwapriya layout,vittasandra,whitefield,yelachenahalli,yelahanka,yelahanka new town,yelenahalli,yeshwanthpur
0,2850.0,4.0,428.0,4.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1630.0,3.0,194.0,3.0,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1875.0,2.0,235.0,3.0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1200.0,2.0,130.0,3.0,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1235.0,2.0,148.0,2.0,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,,,,,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
233,,,,,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
235,,,,,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
237,,,,,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [20]:
# import os
# import sys
# from src.logger.logging import logging
# from src.exceptions.exception import customexception
# import pandas as pd

# from src.components.data_ingestion import DataIngestion
# from src.components.data_transformation import DataTransformation


# data_ingestion = DataIngestion()

# raw_data_path = data_ingestion.initiate_data_ingestion()

# data_transformation = DataTransformation()

# train_df, test_df = data_transformation.initialize_data_transformation(raw_data_path)

In [24]:
X = location_transformed_df.drop(['price'], axis = 'columns')
y = location_transformed_df['price']

In [29]:
X.shape

(7276, 243)

In [28]:
X.isnull().sum()

total_sqft               37
bath                     37
bhk                      37
1st block jayanagar    7035
1st phase jp nagar     7035
                       ... 
yelachenahalli         7035
yelahanka              7035
yelahanka new town     7035
yelenahalli            7035
yeshwanthpur           7035
Length: 243, dtype: int64

In [134]:
import numpy as np
train_array = np.array(train_df)

In [135]:
train_array = np.array(train_df)
test_array = np.array(test_df)


X_train, y_train, X_test, y_test = (
    train_array[:,:-1],
    train_array[:,-1],
    test_array[:,:-1],
    test_array[:,-1]
)

In [136]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()

In [137]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# cross_val_score(LinearRegression(), X, y, cv=cv)

In [138]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['absolute_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scoring = ['neg_mean_squared_error', 'neg_median_absolute_error', 'r2']
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'],scoring = scoring, refit='r2', cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    # return pd.DataFrame(scores,columns=['model','best_score','best_params'])
    return scores

report = find_best_model_using_gridsearchcv(np.concatenate((X_train,X_test), axis =0), np.concatenate((y_train,y_test), axis =0))


In [139]:
report

[{'model': 'linear_regression',
  'best_score': 0.8450476036786242,
  'best_params': {'fit_intercept': False}},
 {'model': 'lasso',
  'best_score': 0.7068845811698203,
  'best_params': {'alpha': 1, 'selection': 'random'}},
 {'model': 'decision_tree',
  'best_score': 0.670512997103037,
  'best_params': {'criterion': 'friedman_mse', 'splitter': 'best'}}]

In [140]:
score = {result['model']: result['best_score'] for result in report}

In [141]:
score

{'linear_regression': 0.8450476036786242,
 'lasso': 0.7068845811698203,
 'decision_tree': 0.670512997103037}

In [150]:
data = {'total_sqft':2850.0,
                             'bath':4,
                             'bhk':4,
                             'location':"1st block jayanagar",}

In [151]:
predict_data = pd.DataFrame([data])

In [152]:
predict_data

Unnamed: 0,total_sqft,bath,bhk,location
0,2850.0,4,4,1st block jayanagar


In [157]:
location_df = do_location_transform(predict_data)

KeyError: "['area_type', 'society', 'balcony', 'availability'] not found in axis"

In [147]:
from src.utils.utils import  load_object

location_preprocessor_path = os.path.join("artifacts", "location_preprocessor.pkl")

preprocessor = load_object(location_preprocessor_path)

In [148]:
transformed_features = preprocessor.transform(predict_data)

KeyError: "['other'] not found in axis"

In [29]:
score

{'linear_regression': np.float64(0.8450476036786242),
 'lasso': np.float64(0.7068857757537745),
 'decision_tree': np.float64(0.6850479358428109)}

In [30]:
best_model_score = max(score.values())
best_model_name = [key for key, value in score.items() if value == best_model_score][0]

In [31]:
best_model_score

np.float64(0.8450476036786242)

In [32]:
best_model_name

'linear_regression'

In [33]:
best_params = [result['best_params']  for result in report if result['model']==best_model_name]

In [34]:
best_params

[{'fit_intercept': False}]

In [36]:
best_model = models[best_model_name]['params']
            
best_model.set_params(**best_params)
best_model.fit(np.concatenate((X_train, X_test), axis=0), np.concatenate((y_train, y_test), axis=0))

AttributeError: 'str' object has no attribute 'set_params'

In [25]:
report.to_dict()

{'model': {0: 'linear_regression', 1: 'lasso', 2: 'decision_tree'},
 'best_score': {0: 0.8450476036786242,
  1: 0.7068699256640066,
  2: 0.7180054780521311},
 'best_params': {0: {'fit_intercept': False},
  1: {'alpha': 1, 'selection': 'cyclic'},
  2: {'criterion': 'friedman_mse', 'splitter': 'random'}}}

In [79]:
import json

with open('./columns.json','r') as file:
    column_list = json.load(file)['data_columns']

In [84]:
location = 'begur'

loc_index = column_list.index(location)

In [86]:
location = 'begur'
total_sqft = 2400
bath = 3
bhk = 3
            
loc_index = column_list.index(location)

x = np.zeros(len(column_list))
x[0] = total_sqft
x[1] = bath
x[2] = bhk
if loc_index >= 0:
    x[loc_index] = 1



0.0

In [85]:
loc_index

44

In [80]:
column_list

['total_sqft',
 'bath',
 'bhk',
 '1st block jayanagar',
 '1st phase jp nagar',
 '2nd phase judicial layout',
 '2nd stage nagarbhavi',
 '5th block hbr layout',
 '5th phase jp nagar',
 '6th phase jp nagar',
 '7th phase jp nagar',
 '8th phase jp nagar',
 '9th phase jp nagar',
 'aecs layout',
 'abbigere',
 'akshaya nagar',
 'ambalipura',
 'ambedkar nagar',
 'amruthahalli',
 'anandapura',
 'ananth nagar',
 'anekal',
 'anjanapura',
 'ardendale',
 'arekere',
 'attibele',
 'beml layout',
 'btm 2nd stage',
 'btm layout',
 'babusapalaya',
 'badavala nagar',
 'balagere',
 'banashankari',
 'banashankari stage ii',
 'banashankari stage iii',
 'banashankari stage v',
 'banashankari stage vi',
 'banaswadi',
 'banjara layout',
 'bannerghatta',
 'bannerghatta road',
 'basavangudi',
 'basaveshwara nagar',
 'battarahalli',
 'begur',
 'begur road',
 'bellandur',
 'benson town',
 'bharathi nagar',
 'bhoganhalli',
 'billekahalli',
 'binny pete',
 'bisuvanahalli',
 'bommanahalli',
 'bommasandra',
 'bommasand