In [22]:
import pandas as pd
import os
import numpy as np
import sklearn
import seaborn as sns

from IPython import display

# Because we have runned the code multiple times
import warnings
warnings.filterwarnings('ignore')

# Plot pretty:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold, cross_val_score


from sklearn.model_selection import StratifiedShuffleSplit

from feature_engine import encoding


In [3]:
rawData = pd.read_csv("rawData_final.csv")
def get_shape(data):
    print("Shape of data: \n"
        " sample size: ", data.shape[0], 
        "\n feature size: " , data.shape[1])
get_shape(rawData) 

Shape of data: 
 sample size:  57011 
 feature size:  219


In [4]:
bolean_raw = rawData.duplicated(subset = 'address.gstKvhx', keep = 'last')
print("Number of duplicates (excl. the kept): ", len(bolean_raw[bolean_raw == True ]))
housing_data = rawData[bolean_raw == False]

get_shape(housing_data)
bolean_friHandel = housing_data['address.latestSale.saleType'] == 'Fri handel'
print("Removing non-fri handel: ", len(bolean_friHandel[bolean_friHandel == False]))
housing_data = housing_data[bolean_friHandel == True]

housing_data.replace('-', np.nan, inplace=True)
housing_data.replace(' - ', np.nan, inplace=True)

housing_data.head()

housing_data_idx = housing_data.reset_index()

get_shape(housing_data)

Number of duplicates (excl. the kept):  12206
Shape of data: 
 sample size:  44805 
 feature size:  219
Removing non-fri handel:  4841
Shape of data: 
 sample size:  39964 
 feature size:  219


In [5]:
sel_num_housing = ['salePrice_b', 'paymentCash_b',
       'AVM_pris_d', 'propertyValuation_b',
       'alfs_areaWeighted', 'buildYear_b', 'radonRiskCategory_d',
       'salesYear_b','Boligstørrelse', 'alfs_area', 
       'alfs_areaBasement', 'Kælder',
       'Vægtet Areal',
       'alfs_numberOfRooms', 'Antal værelser','alfs_buildYear_d',
       'propertyCharges',
       'alfs_postal', 
       'salesPeriod', 'address.latestForSale.dateAnnounced', 
       'address.latestForSale.dateAdded', 
       'address.latestForSale.dateRemoved', 
       'address.latestForSale.salesPeriodTotal',
       'areaResidential_b',
       'numberOfFloors_b', 'floor_b',
       'alfs_rebuildYear', 'Opførselsesår', 
       'Antal Etager', 'numberOfToilets_bd', 'numberOfBaths_bd',
       'turnoutVote_d', 'daycare_h',
       'doctor_h', 'hospital_h', 'junction_h', 'metro_h', 'school_h',
       'busstop_h', 'strain_h', 'supermarket_h', 'train_h', 'library_h',
       'pharmacy_h', 'coast_h', 'forest_h', 'lake_h', 'airport_h',
       'sportshall_h', 'publicbath_h', 'soccerfield_h', 'roadtrain_h',
       'priceIndex_s', 'priceChangeMPriorIndex_s', 'priceChangeYPriorIndex_s',
       'latitude_b', 'longitude_b',
       'Ombygningsår', 'rebuildYear_b',
        'breakInStatistic',
        'WaterHardness',
        "unemploymentRateCPH_s", "unemploymentRateDK_s", "mortgageRate_s", "OMXC20_s",
                  "Bygning, Samlet areal", "aboveSea_d"]


cat_features = ['postalId_b', 'usage_d', 'outerwall_d', 'roof_d', 
                'heating_d', 'biggestParty_d', 'noise_d','energyMark_b', 
                'radonRisk_d', 'floodingRisk_d', 'quarter_b', 'quarter0_b', 
                'kitchen.content_d', 'itemTypeName_b', 'itemtypeName', "city_b", "electionArea_d"]

removed_features = ["address.gstKvhx","municipalityNumber_b","address.oisPropertyNumber","address_b",
"street_b","streetName_b","address.itemType",
"address.itemTypeNumber","address.mapPosition.hasCoordinates","address.wishPropertyLocationLink",
"address.hasEnergyMark","address.energyMark","address.energyMarkLink",
"address.environmentData.soilContamination","address.environmentData.serviceStatus.renewTicket",
"address.environmentData.serviceStatus.errorCode","address.environmentData.serviceStatus.errorText",
"address.environmentData.serviceStatus.errorId",
"address.latestValuation.valuationYear","address.latestValuation.valuationDate",
"valuationDate_b","address.latestValuation.farmhouseParcelValuation",
"address.latestValuation.farmhousePropertyValuation",
"address.latestSale.saleTypeId",
"address.latestForSale.id","address.latestForSale.propertyNumberAgent","address.latestForSale.addressId",
"address.latestForSale.isArchive","address.latestForSale.uniqueNumber",
"address.latestForSale.description","address.latestForSale.descriptionHeadline",
"address.latestForSale.priceDevelopment" ,"address.latestForSale.priceDevelopmentHistoric",
"address.latestForSale.usageExpenses","address.latestForSale.paymentGross",
"address.latestForSale.paymentNet","address.latestForSale.paymentExpenses","address.latestForSale.itemType",
"address.latestForSale.itemTypeNumber","address.latestForSale.marketingItemType","address.latestForSale.address",
"address.latestForSale.streetName","address.latestForSale.houseNumber","address.latestForSale.city",
"address.latestForSale.placeName","address.latestForSale.placeNameSeparator",
"address.latestForSale.imageLink600X400",
"address.latestForSale.canShowSalesPeriodTotal",
"address.latestForSale.areaParcel",
"address.latestForSale.areaWeightedAsterix","address.latestForSale.areaWeightedTitleMessage",
"address.latestForSale.areaWeightedKrM2Title",
"address.latestForSale.agentChainName","address.latestForSale.agentId",
"address.latestForSale.agentsLogoLink",
"address.latestForSale.propertyLink","address.latestForSale.redirectLink",
"address.latestForSale.floorName","address.latestForSale.memberOfDe",
"address.latestForSale.hasEnergyMark","address.latestForSale.energyMarkLink",
"address.latestForSale.hasOpenHouse",
"address.latestForSale.nextOpenHouse",
"address.latestForSale.nextOpenHouseShort","address.latestForSale.nextOpenHouseSignup",
"address.latestForSale.municipalityNumber",
"address.latestForSale.oisPropertyNumber","address.latestForSale.isFavorite","address.latestForSale.hasComment",
"address.latestForSale.comment",
"address.latestForSale.rentalLink","address.latestForSale.linkDomain",
"address.latestForSale.propertyPartiallyOwnedFinancialData","address.latestForSale.mapPosition.hasCoordinates",
"address.latestForSale.mapPosition.latLng.lat","address.latestForSale.mapPosition.latLng.lng",
"address.latestForSale.videoRedirectLink","address.latestForSale.openHouseRedirectLink",
"address.latestForSale.projectSale",
"address.latestForSale.kvhx","address.latestForSale.gstKvhx","address.latestForSale.wishPropertyLocationLink",
"address.latestForSale.hasRentalLink","address.latestForSale.hasVideoLink",
"address.latestForSale.rating.ratings.conditionRating",
"address.latestForSale.rating.ratings.kitchenRating","address.latestForSale.rating.ratings.locationRating",
"address.latestForSale.rating.ratings.bathRating",
"address.latestForSale.rating.averageRating","address.latestForSale.rating.roundAverageRating",
"address.latestForSale.oisHidden","address.latestForSale.nextOpenHouseTime",
"address.latestForSale.calculateLoanAgentChain","address.latestForSale.label","address.latestForSale",
"address.latestValuation","address.environmentData.breakInStatistic","address.latestForSale.mapPosition.latLng",
"address.mapPosition.latLng","dingeo_link",
"address.latestSale","address.latestForSale.propertyPartiallyOwnedFinancialData.purchasePrice",
"address.latestForSale.propertyPartiallyOwnedFinancialData.maximumPriceRatio",
"address.latestForSale.propertyPartiallyOwnedFinancialData.maximumPrice",
"address.latestForSale.propertyPartiallyOwnedFinancialData.housingAssociationDebtShare",
"address.latestForSale.propertyPartiallyOwnedFinancialData.housingAssociationDebt",
"address.latestForSale.propertyPartiallyOwnedFinancialData.financingInformation",
"address.latestForSale.propertyPartiallyOwnedFinancialData.expenseNet",
"address.latestForSale.propertyPartiallyOwnedFinancialData.expenseGross",
"address.latestForSale.propertyPartiallyOwnedFinancialData.estimatedTechnicalPrice",
"address.latestForSale.propertyPartiallyOwnedFinancialData.estimatedTechnicalAreaPrice",
"address.latestForSale.propertyPartiallyOwnedFinancialData.downPayment",
"address.latestForSale.propertyPartiallyOwnedFinancialData.distributionRatio",
"Unnamed: 0", "address.latestForSale.hasAreaWeighted",
"address.latestSale.saleType", "saleDate_b","saleDate_b.1", "previousMonth",
                   "Bevaringsværdig", "Energimærke", "Fredning",
                   "Location", 
                   "address.latestForSale.areaPaymentCash",
                    "address.latestForSale.downPayment",
"address.latestValuation.parcelValuation",
        "address.environmentData.breakInStatistic.countryAverage",
        "address.environmentData.breakInStatistic.countyAverage",
        "address.environmentData.breakInStatistic.riskCategory",
        'priceHouse', 'priceChangeMHouse', 'priceChangeYHouse',
       'priceApartment', 'priceChangeMApart', 'priceChangeYApart']

In [10]:
class DropFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        X_dropped = X.drop(self.variables, axis = 1)
        return X_dropped
    
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        return X.loc[:,self.variables]

class paymentAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables 
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        
        replace_func = np.vectorize(lambda x: float(x.replace('.','')))
        
        rep_X = replace_func(X_.astype(str))
        X_transformed = pd.DataFrame(rep_X, 
                         columns = self.variables)

        X__.drop(self.variables, axis= 1, inplace=True)
        X__[self.variables] = X_transformed[self.variables].values
        return X__
        

pay_vars = ['salePrice_b', 'paymentCash_b', 
            'AVM_pris_d', 'propertyValuation_b']

class itemAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        
        array_X = np.array(X_)
        itemType = np.where((array_X[:, 0] == 'Ejerlejlighed') | \
                            (array_X[:, 0] == 'Rækkehus') | \
                            (array_X[:, 0] == 'Villa') & \
                            (array_X[:, 1] != 'Andelsbolig'), 
                            array_X[:, 0], np.nan)
        
        X_transformed = pd.DataFrame(itemType, 
                         columns = [self.variables[0]])
        X__.drop(self.variables, axis= 1, inplace=True)
        
        X__[self.variables[0]] = X_transformed.values
        return X__
        
item_vars = ['itemTypeName_b', 'itemtypeName']   



class interiorAttributesAdder2(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self  
    def transform(self, X):

        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        
        replace_func = np.vectorize(lambda x: float(x.replace(' m2','')))
        replace_func2 = np.vectorize(lambda x: float(x.replace('nan','0')))
        replace_func3 = np.vectorize(lambda x: float(x.replace(' meter','')))
        
        #rep_X = []
        #for i in self.variables:
        rep_X = replace_func2(replace_func(X_.loc[:,self.variables[:9]].astype(str)).astype(str))
        
        where0_X = np.where(rep_X[:, 1] == 'number', rep_X[:, 1], rep_X[:, 0])
        
        basement = np.where(rep_X[:, 2] == 'number',rep_X[:, 2], rep_X[:, 3])
        where1_X = np.where(basement < 500, 
                            basement, np.nan) #2:basement, 3: kælder
       
        where2_X = np.where(rep_X[:, 5] != 0, rep_X[:, 5], rep_X[:, 4]) 
        where3_X = np.where(rep_X[:, 6] == 'number',rep_X[:, 6], rep_X[:, 7])
        
        where4_X = rep_X[:, 8]
        where5_X = replace_func3(X_.loc[:, self.variables[9]].astype(str))
        
        where6_X = np.where((X_.loc[:, self.variables[10]] < 0) | (X_.loc[:, self.variables[10]] > 10), 
                            np.nan, X_.loc[:, self.variables[10]])
        
        concat_X = np.transpose([where0_X, where1_X, where3_X, where4_X, where5_X, where6_X]) #where2_X, 
        
    
        X_transformed = pd.DataFrame(concat_X, 
                         columns = [self.variables[1], self.variables[2], 
                                     self.variables[6], #self.variables[5],
                                    "BuildingUnion_area", self.variables[9],
                                    self.variables[10]])


        X__.drop(self.variables, axis= 1, inplace=True)
        
        X__[[self.variables[1], "areaBasement", 
             #self.variables[5], 
             "numberOfRooms", 
             "BuildingUnion_area", self.variables[9], self.variables[10]]] = X_transformed.values
        return X__

inter_vars = ['Boligstørrelse', 'alfs_area', 
              'alfs_areaBasement', 'Kælder',
              'Vægtet Areal', 'alfs_areaWeighted',
              'alfs_numberOfRooms', 'Antal værelser', 
              "Bygning, Samlet areal", 
              "aboveSea_d", 'numberOfToilets_bd']  


class interiorAttributesAdder2(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self  
    def transform(self, X):

        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        
        replace_func = np.vectorize(lambda x: float(x.replace(' m2','')))
        replace_func2 = np.vectorize(lambda x: float(x.replace('nan','0')))
        replace_func3 = np.vectorize(lambda x: float(x.replace(' meter','')))
        
        rep_X = replace_func2(replace_func(X_.loc[:,self.variables[:8]].astype(str)).astype(str))
        
        where0_X = np.where(rep_X[:, 1] == 'number', rep_X[:, 1], rep_X[:, 0])
        
        basement = np.where(rep_X[:, 2] == 'number',rep_X[:, 2], rep_X[:, 3])
        where1_X = np.where(basement < 500, 
                            basement, np.nan) #2:basement, 3: kælder
       
        where2_X = np.where(rep_X[:, 5] != 0, rep_X[:, 5], rep_X[:, 4]) 
        where3_X = np.where(rep_X[:, 6] == 'number',rep_X[:, 6], rep_X[:, 7])
        
        #where4_X = rep_X[:, 8]
        where5_X = replace_func3(X_.loc[:, self.variables[8]].astype(str))
        
        where6_X = np.where((X_.loc[:, self.variables[9]] < 0) | (X_.loc[:, self.variables[9]] > 10), 
                            np.nan, X_.loc[:, self.variables[9]])
        
        concat_X = np.transpose([where0_X, where1_X, where3_X, where5_X, where6_X]) # where4_X, where2_X, 
        
    
        X_transformed = pd.DataFrame(concat_X, 
                         columns = [self.variables[1], self.variables[2], 
                                     self.variables[6], #self.variables[5],
                                    #"BuildingUnion_area", 
                                    self.variables[8],
                                    self.variables[9]])


        X__.drop(self.variables, axis= 1, inplace=True)
        
        X__[[self.variables[1], "areaBasement", 
             #self.variables[5], 
             "numberOfRooms", 
             #"BuildingUnion_area", 
             self.variables[8], self.variables[9]]] = X_transformed.values
        return X__

inter_vars2 = ['Boligstørrelse', 'alfs_area', 
              'alfs_areaBasement', 'Kælder',
              'Vægtet Areal', 'alfs_areaWeighted',
              'alfs_numberOfRooms', 'Antal værelser', 
              #"Bygning, Samlet areal", 
              "aboveSea_d", 'numberOfToilets_bd'] 


class rebuildYearAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables): 
        self.variables = variables
        
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:, self.variables]
        array_X = np.array(X_).astype(float)

        
        rebuildYear = np.where(array_X[:, 0] > 0, array_X[:, 0], array_X[:, 3])
        rebuildYear = np.where(array_X[:, 1] > 0, array_X[:, 1], rebuildYear)
        rebuildYear = np.where(array_X[:, 2] > 0, array_X[:, 2], rebuildYear)
        rebuildYear = np.where(rebuildYear == 0, np.nan, rebuildYear)
        rebuildYear = np.where(rebuildYear < 2022, rebuildYear, np.nan)

        buildYear = np.where(array_X[:, 3] == 0, np.nan, array_X[:, 3])
        
        concat_X = np.transpose([rebuildYear, buildYear])
        
    
        X_transformed = pd.DataFrame(concat_X, 
                         columns = [self.variables[0], self.variables[3]])
        X__.drop(self.variables, axis= 1, inplace=True)
        
        X__[[self.variables[0], self.variables[3]]] = X_transformed.values
        return X__
        

rebuild_vars = ['rebuildYear_b', 'alfs_rebuildYear',
               'Ombygningsår', 'buildYear_b']       

from datetime import datetime
class datetimeAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables 
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        
        array_X = np.array(X_)

        DateAnnounced = pd.to_datetime(array_X[:, 0], format='%d-%m-%Y', errors='coerce')
        DateAdded = pd.to_datetime(array_X[:, 1], format='%d-%m-%Y', errors='coerce')
        DateRemoved = pd.to_datetime(array_X[:, 2], format='%d-%m-%Y', errors='coerce')
        
        AddedRemoved = (DateRemoved - DateAdded).days.astype(float)
        AnnouncedRemoved = (DateRemoved - DateAnnounced).days.astype(float)
        PeriodTotal = array_X[:,3].astype(float)
        Period = array_X[:, 4].astype(float)
       
        SalesPeriod = np.where(PeriodTotal > 0, PeriodTotal, Period)
        SalesPeriod = np.where(SalesPeriod > 0, SalesPeriod, AddedRemoved)
        SalesPeriod = np.where(SalesPeriod > 0, SalesPeriod, AnnouncedRemoved)
            
        X_transformed = pd.DataFrame(SalesPeriod, 
                         columns = [self.variables[4]])
        X__.drop(self.variables, axis= 1, inplace=True)
        
        X__[self.variables[4]] = X_transformed.values
        return X__
    
salePeriod_vars = ['address.latestForSale.dateAnnounced', 'address.latestForSale.dateAdded', 
                   'address.latestForSale.dateRemoved', 'address.latestForSale.salesPeriodTotal',
                   'salesPeriod'] 

class postalAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        
        array_X = np.array(X_)


        bins = [0, 1100, 1200, 1300, 1400, 1450, 1500, 1600, 1700, 1800, 1850, 1900, 
                2000, 2001, 2101, 2151, 2201, 2301, 2401, 2451, 2501, 2701, 2721, 2770] 
        #The last number is for removal; Kastrup and Hellerup
        names = ['<1100', '1100-1200', '1200-1300', '1300-1400','1400-1450', '1450-1500', '1500-1600', 
                 '1600-1700', '1700-1800', '1800-1850','1850-1900','1900-2000','2000','2100','2150',
                 '2200','2300', '2400', '2450', '2500', '2700', '2720', '2720<']
        postalId = np.empty((len(X))).astype(str)

        num_postalId = np.searchsorted(bins, array_X).astype(int)
        for b in range(len(names)):
            postalId[num_postalId == (b+1)] = names[b]
        postalId = np.where(postalId == '0.0', '2720<', postalId)
        postalId = np.where(postalId == '2720<', np.nan, postalId)

        
        X_transformed = pd.DataFrame(postalId, 
                         columns = [self.variables])
    
        X__[self.variables] = X_transformed.values
        return X__

postal_vars = "postalId_b"

class energyAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        array_X = np.array(X_)

        energyMark = np.empty((len(X))).astype(str)

        energyMark = np.where((array_X == "a1") | (array_X == "a2") | \
                              (array_X == "a2010") | (array_X == "a2015"),
                              "a", array_X)
        
        X_transformed = pd.DataFrame(energyMark, 
                         columns = [self.variables])
    
        X__[self.variables] = X_transformed.values
        return X__

energy_vars = "energyMark_b"


class usageAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        array_X = np.array(X_)     
        usage = np.empty((len(X))).astype(str)


        usage = np.where(array_X  == "Bolig i etageejendom, flerfamiliehus eller to-familiehus",
                         "Apartment housing", "townhouse, chain house, semi-detached house")
        usage = np.where(array_X  == "Fritliggende enfamiliehus", "detached house", usage)
        
        X_transformed = pd.DataFrame(usage, 
                         columns = [self.variables])
    
        X__[self.variables] = X_transformed.values
        return X__


usage_vars = "usage_d"        


class rnfbAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        X__ = X.copy()
        X_ = X.loc[:,self.variables]
        array_X = np.array(X_)     
        
        radonRisk = np.where(array_X[:, 0] == "Meget lav", "a Very low", "b low")
        radonRisk = np.where(array_X[:, 0] == "medium", "c Medium", radonRisk)
        radonRisk = np.where(array_X[:, 0] == "høj", "d High", radonRisk)
        radonRisk = np.where(array_X[:, 0] == "meget høj", "e Very high", radonRisk)

        noise = np.where(np.where(array_X[:, 1] == "Mangler", "Ingen trafikstøj", 
                                  array_X[:, 1]) == "Ingen trafikstøj", "0-55 dB Noiseless", array_X[:, 1])
        noise = np.where(array_X[:, 1] == "over 75 dB", "75 dB or above", noise)

        floodingRisk = np.where(array_X[:, 2] == "er lav risiko", "a Low risk", "b Possible risk")
        floodingRisk = np.where(array_X[:, 2] == "er høj risiko", "c High risk", floodingRisk)

        biggestParty = np.where(array_X[:, 3] == "enhedslisten", "a Enhedslisten", "b Socialdemokratiet")
        biggestParty = np.where(array_X[:, 3] == "radikale", "c Radikale", biggestParty)
        biggestParty = np.where(array_X[:, 3] == "venstre", "d Vesntre", biggestParty)

        rnfb_X = np.transpose([radonRisk, noise, floodingRisk, biggestParty])
        X_transformed = pd.DataFrame(rnfb_X, 
                        columns = self.variables)
    
        X__[self.variables] = X_transformed.values
  
        return X__
        

rnfb_vars = ['radonRisk_d','noise_d', 'floodingRisk_d', 'biggestParty_d']   


class SimpleImputerCustom(BaseEstimator, TransformerMixin):
    def __init__(self, variables, strategy):
        self.variables = variables
        self.strategy = strategy
        self.imp = SimpleImputer(missing_values = np.nan,   
                                 strategy = self.strategy)
    def fit(self, X, y = None):
        X_ = X.loc[:,self.variables]
        self.imp.fit(X_)
        return self
    def transform(self, X):
        X_ = X.loc[:,self.variables]
        X__ = X.copy()
        X_transformed = pd.DataFrame(self.imp.transform(X_), 
                         columns= self.variables)
        
        X__.drop(self.variables, axis= 1, inplace=True)
        X__[self.variables] = X_transformed[self.variables].values
        return X__

class OutlierRemover(BaseEstimator,TransformerMixin):
    def __init__(self, variables, factor=1.5):
        self.variables = variables
        self.factor = factor
        
    def outlier_detector(self,X,y=None):
        X = pd.Series(X).copy()
        q1 = X.quantile(0.025)
        q3 = X.quantile(0.975)
        iqr = q3 - q1
        self.lower_bound.append(q1)# - (self.factor * iqr))
        self.upper_bound.append(q3)# + (self.factor * iqr))

    def fit(self,X,y=None):
        X_ = pd.DataFrame(X.loc[:, self.variables])
        self.lower_bound = []
        self.upper_bound = []
        X_.apply(self.outlier_detector)
        return self
    
    def transform(self,X,y=None):
        X_ = pd.DataFrame(X.loc[:, self.variables])
        X__ = X.copy()
        for i in range(X_.shape[1]):
            x = X_.iloc[:, i].copy()
            x[(x < self.lower_bound[i]) | (x > self.upper_bound[i])] = np.nan
            X__[self.variables[i]] = x
           
        return X__

    
class OutlierRemover2(BaseEstimator,TransformerMixin):
    def __init__(self, variables, factor = 0.025):
        self.variables = variables[:-1]
        self.areavar = variables[len(variables)-1]
        self.factor = factor
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        #X_ = pd.DataFrame(X.loc[:, self.variables[0]])
        Xsqr = X.loc[:,self.variables[0]] / X.loc[:,self.areavar]
        
        q_down = Xsqr.quantile(self.factor)
        q_up = Xsqr.quantile(1-self.factor)
        
        X__ = X.copy()
        
        X_outlier = np.where((Xsqr < q_down) | (Xsqr > q_up), np.nan, X.loc[:, self.variables[0]] )
        X__[self.variables[0]] = X_outlier
           
        return X__

    
class MakeToDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        X__ = pd.DataFrame(X,
                          columns = self.variables)
        return X__

class OneHotEncodercustom(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
        self.ohe = OneHotEncoder(drop = 'first', handle_unknown = "error")
    def fit(self, X, y = None):
        X_ = X.loc[:,self.variables]
        self.ohe.fit(X_)
        return self
    def transform(self, X):
        X_ = X.loc[:,self.variables]
        X__ = X.copy()
        
        X_transformed = pd.DataFrame(self.ohe.transform(X_).toarray(),
                                    columns = self.ohe.get_feature_names_out())
        X__.drop(self.variables, axis = 1, inplace = True)
        X__[self.ohe.get_feature_names_out()] = \
            X_transformed[self.ohe.get_feature_names_out()].values
        return X__
    
class OrdinalEncodercustom(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
        self.od = OrdinalEncoder(handle_unknown = 'error')
    def fit(self, X, y = None):
        X_ = X.loc[:,self.variables]
        self.od.fit(X_)
        return self
    def transform(self, X):
        X_ = X.loc[:,self.variables]
        X__ = X.copy()
        
        od_X = self.od.transform(X_)
        X_transformed = pd.DataFrame(od_X, 
                                     columns = self.variables)
        
        X__.drop(self.variables, axis= 1, inplace=True)
        
        X__[self.variables] = \
                X_transformed[self.variables].values
        return X__
    
class StandardScalercustom(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
        self.ss = StandardScaler()
    def fit(self, X, y = None):
        X_ = X.loc[:, X.columns != self.variables]
        self.ss.fit(X_)
        return self
    def transform(self, X):
        X_ = X.loc[:, X.columns != self.variables]
        #columns_list = list(X.columns.difference([self.variables]))
        columns_list = list(X.loc[:, X.columns != self.variables].columns)
        X__ = X.copy()
        
        ss_X = self.ss.transform(X_)
        X_transformed = pd.DataFrame(ss_X, 
                                     columns = columns_list)
        
        X__.drop(columns_list, axis= 1, inplace=True)
        
        X__[columns_list] = \
                X_transformed[columns_list].values
        return X__
    
    
def drop_nans(X, y=None):
    total = X.shape[1]                                           
    #new_thresh = total - thresh
    df = pd.DataFrame(X)
    df.dropna(inplace=True) #thresh=new_thresh,
    df = np.array(df)
    return df

In [11]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) 

housing_with_idx = housing_data.reset_index()

for train_index, test_index in split.split(housing_with_idx, housing_with_idx['city_b']):
        strat_train_set = housing_with_idx.loc[train_index]
        strat_test_set = housing_with_idx.loc[test_index]
        
print("train set shape: ", strat_train_set.shape)
print("test set shape: ", strat_test_set.shape)
print(strat_test_set['city_b'].value_counts() / len(strat_test_set))
print(housing_with_idx['city_b'].value_counts() / len(housing_with_idx))

train set shape:  (31971, 220)
test set shape:  (7993, 220)
København S        0.232954
København Ø        0.118729
Frederiksberg      0.098211
Valby              0.077943
København K        0.072814
København N        0.068185
København SV       0.058301
København V        0.058301
Vanløse            0.054172
København NV       0.048542
Frederiksberg C    0.048042
Brønshøj           0.048042
Nordhavn           0.015764
Name: city_b, dtype: float64
København S        0.232910
København Ø        0.118682
Frederiksberg      0.098238
Valby              0.077920
København K        0.072866
København N        0.068211
København SV       0.058277
København V        0.058252
Vanløse            0.054124
København NV       0.048619
Brønshøj           0.048068
Frederiksberg C    0.048068
Nordhavn           0.015764
Name: city_b, dtype: float64


In [18]:
setA = set(sel_num_housing)
removed_on_the_go = ["propertyValuation_b", "paymentCash_b", "AVM_pris_d", 'alfs_postal',
                     'priceChangeYPriorIndex_s', 'longitude_b', 'latitude_b', 'Antal Etager', #What remove this round
                     'WaterHardness', 'quarter0_b', 'city_b', 'Opførselsesår', "itemTypeName_b", 'itemtypeName',
                     "Bygning, Samlet areal", 'areaResidential_b', 
                     'unemploymentRateDK_s', 'priceChangeYPriorIndex_s', 'alfs_buildYear_d',
                     'kitchen.content_d', 'heating_d', 'electionArea_d']
# Get new set with elements that are only in a but not in b
updated_sel_num = setA.difference(removed_on_the_go)

setB = set(cat_features)
updated_cat_feat = setB.difference(removed_on_the_go)


drop_features = DropFeatureSelector(variables = removed_features + removed_on_the_go)
num_feature_selector = FeatureSelector(variables = updated_sel_num) # + ["city_b"])# to make boxplot
cat_feature_selector = FeatureSelector(variables = updated_cat_feat)
num_paymentAttributesAdder = paymentAttributesAdder(variables = ["salePrice_b"])
num_interiorAttributesAdder = interiorAttributesAdder2(variables = inter_vars2)
num_rebuildYearAttributesAdder = rebuildYearAttributesAdder(variables = rebuild_vars)
num_datetimeAttributesAdder = datetimeAttributesAdder(variables = salePeriod_vars)

num_StandardScalercustom = StandardScalercustom(variables = 'salePrice_b' )


#outlier_vars = ['salePrice_b']
#num_outlier_remover = OutlierRemover(variables = outlier_vars)
outlier_vars2 = ['salePrice_b', 'alfs_area']
num_outlier_remover = OutlierRemover2(variables = outlier_vars2, factor = 0.05)


cat_postalAttributesAdder = postalAttributesAdder(variables = postal_vars)
cat_energyAttributesAdder = energyAttributesAdder(variables = energy_vars)
cat_usageAttributesAdder = usageAttributesAdder(variables = usage_vars)
cat_rnfbAttributesAdder = rnfbAttributesAdder(variables = rnfb_vars)
cat_most_frequent_imputer = SimpleImputerCustom(variables = [#'kitchen.content_d',
                                                             'outerwall_d',
                                                             'roof_d'], 
                                                strategy = 'most_frequent')


onehot_housing_2 = ['postalId_b', 'usage_d', 'outerwall_d', 'roof_d', 
                    'biggestParty_d']

cat_OneHotEncodercustom = OneHotEncodercustom(variables = onehot_housing_2)
ord_housing_2 = ['noise_d', 'energyMark_b', 
                'radonRisk_d', 'floodingRisk_d',
                'quarter_b']
cat_OrdinalEncodercustom = OrdinalEncodercustom(variables = ord_housing_2)


rare_encoder = encoding.RareLabelEncoder(tol=0.015, n_categories=2,
                    variables=['outerwall_d', 'roof_d'], replace_with='rare') ### This one is added


complete_drop_nans = FunctionTransformer(drop_nans, validate=False)

num_preprocessing_pipe = Pipeline(
    steps=[
        ('drop_features', drop_features),
        ("num_feature_selector", num_feature_selector),
        ('interiorAttributesAdder', num_interiorAttributesAdder),
        ('rebuildYearAttributesAdder', num_rebuildYearAttributesAdder),
        ('datetimeAttributesAdder', num_datetimeAttributesAdder),   
        ('paymentAttributesAdder', num_paymentAttributesAdder),
        ('num_outlier_remover', num_outlier_remover),
        ('StandardScalercustom', num_StandardScalercustom)   
    ]
)

cat_preprocessing_pipe = Pipeline(steps=[
    ('drop_features', drop_features),
    ("cat_feature_selector", cat_feature_selector),
    #('itemAttributesAdder', cat_itemAttributesAdder),    
    ("postalAttributesAdder", cat_postalAttributesAdder),
    ("energyAttributesAdder", cat_energyAttributesAdder),
    ("usageAttributesAdder", cat_usageAttributesAdder),
    ("rnfbAttributesAdder", cat_rnfbAttributesAdder),
    #("cat_most_frequent_imputer", cat_most_frequent_imputer) #Trying without
    
])

combined_preprocessing = FeatureUnion([
    ('numericals', num_preprocessing_pipe),
    ('categoricals', cat_preprocessing_pipe)
])

num_features_col = list(num_preprocessing_pipe.fit_transform(strat_train_set).columns)
cat_features_col = list(cat_preprocessing_pipe.fit_transform(strat_train_set).columns)
Make_dataframe = MakeToDataFrame(num_features_col + cat_features_col)

complete_pipeline = Pipeline([
    ('preprocessing', combined_preprocessing),
    ('complete_drop_nans', complete_drop_nans),
    ('DataFrame_maker', Make_dataframe),
    ('ohe_rare_encoder', rare_encoder),
    ("cat_OneHotEncodercustom", cat_OneHotEncodercustom),
    ("cat_OrdinalEncodercustom", cat_OrdinalEncodercustom),
    ('To_make_array', complete_drop_nans)
    
])

housing_data_pre_pipe = complete_pipeline.fit_transform(strat_train_set)

In [20]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [23]:
Find_salePrice = 0 #It is the first because of how custom std scaler works

reg_data = complete_pipeline.transform(strat_train_set)

pre_y = reg_data[:,Find_salePrice].astype(float)
pre_X = np.delete(reg_data, obj = Find_salePrice , axis = 1)



lin_reg = LinearRegression()
lin_reg.fit(pre_X, pre_y)


housing_predictions = lin_reg.predict(pre_X)
lin_mse = mean_squared_error(pre_y, housing_predictions)

lin_rmse = np.sqrt(lin_mse)
print("Predictions:", lin_reg.predict(pre_X))
print("Label-values:", pre_y)
print("Linear rmse:", lin_rmse)
print("Absolute loss:", mean_absolute_error(pre_y, housing_predictions))
print("Relative loss:", mean_absolute_percentage_error(pre_y, lin_reg.predict(pre_X)))

Predictions: [2154452.30783213 3554293.41718087 5433388.22905581 ... 5829611.5385097
 1299476.03752258 5568912.03494655]
Label-values: [1685000. 3625000. 5900000. ... 5000000. 1545000. 5300000.]
Linear rmse: 620115.9670585608
Absolute loss: 427350.1109731169
Relative loss: 0.11584419875601101
