In [1]:
import os

import numpy as np
import pandas as pd


# Location of the raw CSV. The default is the original author's local path;
# set the BENGALURU_HOUSE_PRICES_CSV environment variable to point at the file
# on any other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "BENGALURU_HOUSE_PRICES_CSV",
    "C:/Users/praka/my_personal_project/monroe_house_price_prediction/data/bengaluru_house_prices.csv",
)
df1 = pd.read_csv(DATA_PATH)

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

In [3]:
class DropUnwantedColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    columns_to_drop : label or list of labels
        Column name(s) handed to ``DataFrame.drop`` when ``transform`` runs.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Learn nothing; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` without ``columns_to_drop``.

        ``DataFrame.drop`` returns a new frame, so the input is left untouched.
        """
        unwanted = self.columns_to_drop
        return X.drop(labels=unwanted, axis='columns')

In [4]:
class DropNullValues(BaseEstimator, TransformerMixin):
    """Stateless transformer that discards every row holding a missing value."""

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` that contain no missing value in any column."""
        complete_rows = X.dropna(axis='index', how='any')
        return complete_rows

In [5]:
class FeatureEngineeringBHK(BaseEstimator, TransformerMixin):
    """Derive an integer ``bhk`` column from the free-text ``size`` column.

    The value is the integer parsed from the text before the first space of
    ``size``. Rows whose ``size`` is not a string, or does not start with an
    integer, make ``transform`` raise.
    """

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``bhk`` column added (or replaced).

        The input frame is not modified: the new column is attached with
        ``assign`` instead of being written into the caller's DataFrame.
        """
        bhk = X['size'].apply(lambda size: int(size.split(' ')[0]))
        return X.assign(bhk=bhk)

In [6]:
class ConvertSqftToNum(BaseEstimator, TransformerMixin):
    """Turn the ``total_sqft`` column into floats and drop unparseable rows.

    * ``"a-b"`` (exactly one dash, both sides numeric) becomes the midpoint.
    * Anything ``float()`` accepts becomes that float.
    * Everything else becomes ``None`` and the row is removed.
    """

    @staticmethod
    def _convert_sqft_to_num(x):
        """Parse one ``total_sqft`` cell; return a float, or ``None`` if unparseable."""
        if isinstance(x, str):
            tokens = x.split('-')
            if len(tokens) == 2:
                try:
                    return (float(tokens[0]) + float(tokens[1])) / 2
                except ValueError:
                    # Not a numeric range (e.g. a negative number or text with
                    # a dash); fall through and let float() have a go.
                    pass
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with numeric ``total_sqft`` and no null values in it.

        The input frame is not modified.
        """
        converted = X.assign(total_sqft=X['total_sqft'].apply(self._convert_sqft_to_num))
        # .copy() detaches the result from `converted` so later column
        # assignments on it do not raise SettingWithCopyWarning.
        return converted[converted['total_sqft'].notnull()].copy()

In [7]:
class FeatureEngineeringPricePerSqft(BaseEstimator, TransformerMixin):
    """Add a ``price_per_sqft`` column computed from ``price`` and ``total_sqft``."""

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``price_per_sqft = price * 100000 / total_sqft``.

        The input frame is not modified; the original wrote the column into
        ``X`` directly, which mutates the caller's data and can raise
        ``SettingWithCopyWarning`` when ``X`` is a filtered slice.
        """
        # NOTE(review): the 100000 factor presumably converts a price quoted in
        # lakhs into rupees - confirm against the dataset's documentation.
        price_per_sqft = X['price'] * 100000 / X['total_sqft']
        return X.assign(price_per_sqft=price_per_sqft)

In [8]:
class FeatureEngineeringLocation(BaseEstimator, TransformerMixin):
    """Normalise ``location`` and collapse infrequent values into ``'other'``.

    Surrounding whitespace is stripped, then every location that occurs 10
    times or fewer in the frame being transformed is replaced by ``'other'``.
    The counts are recomputed on each ``transform`` call; nothing is learned
    in ``fit``.
    """

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the cleaned ``location`` column.

        The input frame is not modified.
        """
        location = X['location'].str.strip()
        counts = location.value_counts()
        # Threshold is inclusive: a location with exactly 10 rows is also rare.
        rare_locations = counts[counts <= 10].index
        location = location.where(~location.isin(rare_locations), 'other')
        return X.assign(location=location)

In [10]:
class OutlierRemovalPricepersqft(BaseEstimator, TransformerMixin):
    """Remove rows with implausible area per bedroom or price per square foot.

    1. Rows where ``total_sqft / bhk`` is below 300 are dropped.
    2. Within each ``location``, only rows whose ``price_per_sqft`` lies in
       ``(mean - std, mean + std]`` for that location are kept.
    """

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return the filtered rows, ordered by location, with a fresh 0..n-1 index."""
        X = X[~(X.total_sqft / X.bhk < 300)]

        kept = []
        for _, subdf in X.groupby('location'):
            m = np.mean(subdf.price_per_sqft)
            st = np.std(subdf.price_per_sqft)
            # The lower bound is exclusive, so a location whose rows all share
            # one price (std == 0, e.g. a single row) is removed entirely.
            in_band = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
            kept.append(subdf[in_band])

        if not kept:
            # pd.concat rejects an empty list; keep the column layout instead.
            return X.iloc[0:0].reset_index(drop=True)
        # One concat at the end instead of growing a frame inside the loop.
        return pd.concat(kept, ignore_index=True)

In [11]:
class OutlierRemovalBHK(BaseEstimator, TransformerMixin):
    """Drop rows priced below the mean of the next-smaller ``bhk`` in the same location.

    For every ``location`` and every ``bhk`` value ``n`` in it, if the same
    location has more than 5 rows with ``bhk == n - 1``, rows of the ``n`` group
    whose ``price_per_sqft`` is below the mean ``price_per_sqft`` of the
    ``n - 1`` group are removed.
    """

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the rows flagged as outliers; the index is kept as is."""
        # Labels are gathered in a plain list so they keep their original type
        # (the float ndarray used before coerced every label to float).
        labels_to_drop = []
        for _, location_df in X.groupby('location'):
            bhk_groups = list(location_df.groupby('bhk'))
            bhk_stats = {
                bhk: {
                    'mean': np.mean(bhk_df.price_per_sqft),
                    'count': bhk_df.shape[0],
                }
                for bhk, bhk_df in bhk_groups
            }
            for bhk, bhk_df in bhk_groups:
                smaller = bhk_stats.get(bhk - 1)
                if smaller is not None and smaller['count'] > 5:
                    too_cheap = bhk_df.price_per_sqft < smaller['mean']
                    labels_to_drop.extend(bhk_df.index[too_cheap])

        return X.drop(labels_to_drop, axis='index')

In [12]:
class OutlierRemovalBathroom(BaseEstimator, TransformerMixin):
    """Keep rows with ``bath < bhk + 2`` and discard two helper columns.

    Besides the bathroom filter, ``transform`` also removes the ``size`` and
    ``price_per_sqft`` columns from the result.
    """

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return the filtered rows without ``size`` and ``price_per_sqft``."""
        plausible_bath_count = X.bath < X.bhk + 2
        return X[plausible_bath_count].drop(columns=['size', 'price_per_sqft'])

In [13]:
class LocationOneHotEncoding(BaseEstimator, TransformerMixin):
    """Replace ``location`` with one indicator column per location value.

    The ``'other'`` indicator, when present, is left out so it acts as the
    reference category. The set of indicator columns is derived from the frame
    passed to ``transform``; nothing is learned in ``fit``.
    """

    def fit(self, X, y=None):
        """Learn nothing; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with ``location`` swapped for its indicator columns."""
        location_dummies = pd.get_dummies(X['location'])
        # errors='ignore': data without any 'other' location must not crash.
        location_dummies = location_dummies.drop('other', axis='columns', errors='ignore')
        X = pd.concat([X, location_dummies], axis='columns')
        X = X.drop('location', axis='columns')

        return X

In [15]:
from sklearn.pipeline import Pipeline

def get_data_transform_pipeline():
    """Build the data-preparation Pipeline.

    Returns a new, unfitted ``Pipeline`` whose steps run in this order: drop
    unused columns, drop null rows, derive ``bhk``, make ``total_sqft``
    numeric, derive ``price_per_sqft``, group rare locations, apply the three
    outlier filters, and one-hot encode ``location``.
    """
    unwanted_columns = ['area_type', 'society', 'balcony', 'availability']
    steps = [
        ('drop_unwanted_columns', DropUnwantedColumns(columns_to_drop=unwanted_columns)),
        ('drop_null_values', DropNullValues()),
        ('feature_engineering_bhk', FeatureEngineeringBHK()),
        ('convert_sqft_to_num', ConvertSqftToNum()),
        ('feature_engineering_price_per_sqft', FeatureEngineeringPricePerSqft()),
        ('feature_engineering_location', FeatureEngineeringLocation()),
        ('outlier_removal_pricepersqft', OutlierRemovalPricepersqft()),
        ('outlier_removal_bhk', OutlierRemovalBHK()),
        ('outlier_removal_bathroom', OutlierRemovalBathroom()),
        ('location_one_hot_encoding', LocationOneHotEncoding()),
    ]
    return Pipeline(steps=steps)

In [16]:
# Build a fresh, unfitted instance of the data-preparation pipeline.
pipeline = get_data_transform_pipeline()

In [17]:
# Run every pipeline step on the raw frame loaded from the CSV (df1).
# Each custom step's fit() just returns self, so all the work happens in transform().
transformed_df = pipeline.fit_transform(df1)

In [18]:
# Show the transformed frame as this cell's output.
transformed_df

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1630.0,3.0,194.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1875.0,2.0,235.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1200.0,2.0,130.0,3,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1235.0,2.0,148.0,2,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10233,1200.0,2.0,70.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10234,1800.0,1.0,200.0,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10237,1353.0,2.0,110.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10238,812.0,1.0,26.0,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
