In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.preprocessing import RobustScaler
from category_encoders.target_encoder import TargetEncoder

In [None]:
# Numeric columns whose missing values are replaced by a statistic
# (DataLoader.impute_numeric uses the training median for these).
numeric_columns_impute = [
    'LotFrontage',
    'GarageCars',
    'GarageArea',
]
# Numeric columns whose missing values are replaced by 0
# (see DataLoader.impute_numeric_zero).
numeric_columns_impute_zero = [
    'GarageYrBlt',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
]
class DataLoader():
    """Read the train/test CSV files and apply the pandas-level preprocessing.

    On construction the loader:
      1. reads both CSV files,
      2. removes outlier rows from the training data,
      3. separates ``SalePrice`` (target) and ``Id`` from the features,
      4. adds string versions of ``MSSubClass`` / ``MoSold`` / ``YrSold``,
      5. imputes the numeric columns listed in the module-level
         ``numeric_columns_impute`` / ``numeric_columns_impute_zero``,
      6. adds engineered features, and
      7. Box-Cox transforms highly skewed numeric columns.

    The results are exposed through ``getX_train``, ``getX_test``,
    ``getY_train`` and ``get_Test_id``.
    """
    def __init__(self,train_path,test_path):
        """Load and preprocess the data.

        Parameters
        ----------
        train_path : path or buffer accepted by ``pd.read_csv``
            Training data; must contain ``Id`` and ``SalePrice``.
        test_path : path or buffer accepted by ``pd.read_csv``
            Test data; must contain ``Id``.
        """
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        train = self.__remove_outliers(train)
        self.X_train = self._add_string_categoricals(train.drop(['Id','SalePrice'], axis = 1))
        self.Y_train = train['SalePrice']
        self.X_test = self._add_string_categoricals(test.drop(['Id'], axis = 1))
        self.test_Id = test['Id']
        #self.X_train, self.X_test = self.impute_granually(self.X_train, self.X_test)
        self.X_train, self.X_test = self.impute_numeric(self.X_train, self.X_test,numeric_columns_impute)
        self.X_train, self.X_test = self.impute_numeric_zero(self.X_train, self.X_test,numeric_columns_impute_zero)
        self.X_train = self.feature_engineering(self.X_train)
        self.X_test = self.feature_engineering(self.X_test)
        self.X_train, self.X_test = self.remove_skewness(self.X_train, self.X_test)
    @staticmethod
    def _add_string_categoricals(frame):
        """Cast ``MSSubClass`` to str and add str copies of ``MoSold``/``YrSold``.

        The frame is modified in place and returned.
        """
        frame['MSSubClass'] = frame['MSSubClass'].astype(str)
        frame['MoSoldStr'] = frame['MoSold'].astype(str)
        frame['YrSoldStr'] = frame['YrSold'].astype(str)
        return frame
    def getX_train(self):
        """Return the preprocessed training features."""
        return self.X_train
    def getX_test(self):
        """Return the preprocessed test features."""
        return self.X_test
    def getY_train(self):
        """Return the training target (``SalePrice``, untransformed)."""
        return self.Y_train
    def get_Test_id(self):
        """Return the ``Id`` column of the test file."""
        return self.test_Id
    def __remove_outliers(self,train):
        """Drop outlier rows from the training frame and renumber the index.

        Rows with ``GrLivArea >= 4500`` are removed first; afterwards five
        hard-coded row positions are dropped. The positions refer to the
        frame *after* the first filter and re-index.
        NOTE(review): the positions were presumably found during exploration
        of this specific training file -- confirm before reusing elsewhere.
        """
        train = train[train.GrLivArea < 4500].reset_index(drop=True)
        outliers = [30, 88, 462, 631, 1322]
        train = train.drop(train.index[outliers]).reset_index(drop=True)
        return train
    def remove_skewness(self,x_train,x_test):
        """Box-Cox transform numeric columns whose skewness exceeds 0.5.

        Skewness and the Box-Cox lambda are estimated on train and test
        stacked together (NaNs ignored), then the stacked frame is split back
        at the original train length.

        Returns
        -------
        tuple of DataFrame
            ``(x_train_transformed, x_test_transformed)``.
        """
        x = pd.concat([x_train,x_test])
        numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        numerics = [name for name in x.columns if x[name].dtype in numeric_dtypes]
        skew_features = x[numerics].apply(lambda col: skew(col.dropna())).sort_values(ascending=False)
        high_skew = skew_features[skew_features > 0.5]
        for name in high_skew.index:
            # boxcox1p works on (1 + value), so lambda is fitted on value + 1 too.
            x[name] = boxcox1p(x[name], boxcox_normmax(x[name].dropna() + 1))
        n_train = x_train.shape[0]
        return (x.iloc[:n_train], x.iloc[n_train:])
    def impute_numeric(self,x_train,x_test,cols):
        """Fill NaNs in ``cols`` with the *training* median, in both frames.

        Both frames are modified in place and returned as ``(x_train, x_test)``.
        """
        for name in cols:
            agg = x_train[name].median()
            # Assign the filled column back instead of calling
            # ``frame[name].fillna(..., inplace=True)``: the in-place call acts
            # on a temporary selection and does not update the frame when
            # pandas Copy-on-Write is active.
            x_train[name] = x_train[name].fillna(agg)
            x_test[name] = x_test[name].fillna(agg)
        return x_train,x_test
    def impute_numeric_zero(self,x_train,x_test,cols):
        """Fill NaNs in ``cols`` with 0, in both frames.

        Both frames are modified in place and returned as ``(x_train, x_test)``.
        """
        for name in cols:
            # See impute_numeric for why the column is re-assigned.
            x_train[name] = x_train[name].fillna(0)
            x_test[name] = x_test[name].fillna(0)
        return x_train,x_test
    def __impute_cats_group(self,cols,features):
        """Fill a missing value in a group of related columns with the mode.

        For every column in ``cols``, a NaN is replaced by that column's most
        frequent value, but only in rows where at least one *other* column of
        the group is non-null. Rows where the whole group is missing are left
        untouched.
        """
        for col in cols:
            others = cols[:]
            others.remove(col)
            any_other_present = features[others].notnull().any(axis=1)
            features.loc[any_other_present, col] = features.loc[any_other_present, col].fillna(features[col].mode()[0])
        return features
    def impute_granually(self,x_train,x_test):
        """Rule-based imputation over train and test stacked together.

        Currently not called from ``__init__`` (the call is commented out).

        Returns
        -------
        tuple of DataFrame
            ``(x_train_imputed, x_test_imputed)`` split at the original
            train length.
        """
        features = pd.concat([x_train,x_test])

        features['Functional'] = features['Functional'].fillna('Typ')

        # Garage attributes: fill with the column mode only where a garage
        # type is recorded.
        has_garage_type = features['GarageType'].notnull()
        for col in ['GarageFinish', 'GarageQual', 'GarageCond', 'GarageYrBlt']:
            features.loc[has_garage_type, col] = features.loc[has_garage_type, col].fillna(features[col].mode()[0])

        # NOTE(review): 2207 is presumably a data-entry typo for 2007 -- confirm.
        features.loc[(features['GarageYrBlt'] == 2207),'GarageYrBlt'] = 2007

        features = self.__impute_cats_group(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'],features)

        features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda grp: grp.fillna(grp.median()))
        features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda grp: grp.fillna(grp.mode()[0]))

        n_train = x_train.shape[0]
        return (features.iloc[:n_train], features.iloc[n_train:])

    def feature_engineering(self,features):
        """Add aggregate and indicator columns; returns the same frame.

        Added columns: ``YrBltAndRemod``, ``TotalSF``, ``Total_sqr_footage``,
        ``Total_Bathrooms``, ``Total_porch_sf`` and the 0/1 flags ``haspool``,
        ``has2ndfloor``, ``hasgarage``, ``hasbsmt``, ``hasfireplace``.
        The input frame is modified in place.
        """
        #features['YearsSinceRemodel'] = features['YrSold'] - features['YearRemodAdd']
        features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
        features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
        features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])
        features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                               features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))
        features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                              features['EnclosedPorch'] + features['ScreenPorch'] +
                              features['WoodDeckSF'])
        # 1 when the source quantity is strictly positive, else 0 (NaN -> 0).
        flag_sources = [('haspool', 'PoolArea'), ('has2ndfloor', '2ndFlrSF'),
                        ('hasgarage', 'GarageArea'), ('hasbsmt', 'TotalBsmtSF'),
                        ('hasfireplace', 'Fireplaces')]
        for flag, source in flag_sources:
            features[flag] = (features[source] > 0).astype('int64')

        return features
        
        

In [None]:
class ColumnDrop(BaseEstimator,TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a DataFrame."""
    def __init__(self,columns):
        # Column labels to remove in transform().
        self.columns = columns
    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self
    def transform(self, X):
        """Return ``X`` without the configured columns (``X`` itself is not modified)."""
        trimmed = X.drop(columns=self.columns)
        return trimmed
class FeatureNormalizer(BaseEstimator,TransformerMixin):
    """Box-Cox transform numeric columns whose skewness exceeds 0.5.

    ``fit`` selects the skewed columns and estimates one Box-Cox lambda per
    column; ``transform`` applies ``boxcox1p`` with those stored lambdas, so
    the same transformation is applied to every data set passed afterwards.

    Attributes set by ``fit``
    -------------------------
    skew_index : pandas.Index
        Names of the columns to transform, most skewed first.
    lambdas_ : dict
        Column name -> fitted Box-Cox lambda.
    """
    def fit(self, X, y=None):
        """Select skewed numeric columns of ``X`` and fit their lambdas."""
        numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        numerics = [name for name in X.columns if X[name].dtype in numeric_dtypes]
        # NaNs are dropped before measuring skewness (as DataLoader.remove_skewness
        # does); otherwise skew() returns NaN and the column is silently skipped.
        # Building a Series explicitly also keeps this working when there are no
        # numeric columns at all.
        skew_features = pd.Series({name: skew(X[name].dropna()) for name in numerics}, dtype=float)
        skew_features = skew_features.sort_values(ascending=False)
        high_skew = skew_features[skew_features > 0.5]
        self.skew_index = high_skew.index
        # boxcox1p works on (1 + value), so lambda is fitted on value + 1 too.
        self.lambdas_ = {name: boxcox_normmax(X[name].dropna() + 1) for name in self.skew_index}
        return self
    def transform(self, X):
        """Return a transformed copy of ``X``; the input frame is left untouched."""
        X = X.copy()
        for name in self.skew_index:
            X[name] = boxcox1p(X[name], self.lambdas_[name])
        return X
class ToDataframeConverter(BaseEstimator,TransformerMixin):
    """Stateless transformer that wraps its input in a ``pandas.DataFrame``."""
    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self
    def transform(self, X):
        """Return ``pd.DataFrame(X)`` (default index and column labels apply for arrays)."""
        as_frame = pd.DataFrame(data=X)
        return as_frame
class OverfitDrop(BaseEstimator,TransformerMixin):
    """Drop columns in which a single value covers almost every row.

    Parameters
    ----------
    threshold : float, default 99.94
        A column is dropped when its most frequent value accounts for more
        than this percentage of the rows seen in ``fit``.

    Attributes set by ``fit``
    -------------------------
    overfit : list
        Labels of the columns that ``transform`` removes.
    """
    def __init__(self, threshold=99.94):
        self.threshold = threshold
    def fit(self, X, y=None):
        """Record every near-constant column of ``X``."""
        n_rows = len(X)
        overfit = []
        for name in X.columns:
            counts = X[name].value_counts()
            # value_counts() ignores NaN, so an all-NaN column (or an empty
            # frame) yields no counts; there is nothing to compare then.
            if counts.empty or n_rows == 0:
                continue
            # value_counts() sorts descending: the first entry is the count
            # of the most frequent value.
            top_share = counts.iloc[0] / n_rows * 100
            # The check must run for every column, i.e. inside the loop.
            if top_share > self.threshold:
                overfit.append(name)
        self.overfit = overfit
        return self
    def transform(self, X):
        """Return ``X`` without the columns recorded by ``fit``."""
        return X.drop(self.overfit, axis=1)

In [None]:
class TargetNormalizedRegressor(BaseEstimator, RegressorMixin):
    """Wrap a regressor so that it is trained on ``log1p(y)``.

    Predictions are mapped back to the original target scale with ``expm1``.

    Parameters
    ----------
    regressor : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    """
    def __init__(self,regressor):
        self.regressor = regressor
    def fit(self, X_train, Y_train):
        """Fit the wrapped regressor on the log-transformed target.

        Returns ``self`` (not the inner regressor), so that chained calls such
        as ``est.fit(X, y).predict(X)`` go through this wrapper's ``predict``
        and yield values on the original target scale.
        """
        self.regressor.fit(X_train,np.log1p(Y_train))
        return self
    def predict(self, X):
        """Predict on the original target scale."""
        return np.expm1(self.regressor.predict(X))
class BlendingRegressor(BaseEstimator, RegressorMixin):
    """Weighted sum of the predictions of several regressors.

    Parameters
    ----------
    regressors_and_weights : iterable of (regressor, weight) pairs
        Every regressor is fitted on the same data; its prediction is
        multiplied by ``weight`` and the products are added up.
    """
    def __init__(self,regressors_and_weights):
        self.regressors_and_weights = regressors_and_weights
    def fit(self, X_train, Y_train):
        """Fit each member regressor on ``(X_train, Y_train)``; returns ``self``."""
        for member, _share in self.regressors_and_weights:
            member.fit(X_train,Y_train)
        return self
    def predict(self, X):
        """Return the weighted sum of the members' predictions for ``X``."""
        blended = 0
        for member, share in self.regressors_and_weights:
            contribution = share*member.predict(X)
            blended += contribution
        return blended

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.impute import SimpleImputer
class PipelineWithFeatureNames(Pipeline):
    """``Pipeline`` that can report output feature names without arguments.

    Parameters
    ----------
    steps, memory, verbose
        Forwarded to ``sklearn.pipeline.Pipeline``.
    feature_names : list of str, optional
        Names of the input columns; passed as ``input_features`` to the final
        step when feature names are requested.
    """
    def __init__(self, steps, memory=None, verbose=False, feature_names = None):
        # Forward by keyword: ``memory`` and ``verbose`` are keyword-only in
        # later scikit-learn releases, and keyword passing works on older
        # releases as well.
        super().__init__(steps, memory=memory, verbose=verbose)
        self.feature_names = feature_names
    def get_feature_names(self):
        """Return the output feature names of the pipeline's final step."""
        final_step = self.steps[-1][-1]
        if hasattr(final_step, 'get_feature_names'):
            return final_step.get_feature_names(input_features = self.feature_names)
        # Later scikit-learn releases expose the same information through
        # get_feature_names_out instead.
        return final_step.get_feature_names_out(input_features = self.feature_names)
class SimpleImputerWithFeatureNames(SimpleImputer):
    """``SimpleImputer`` that remembers the names of the columns it imputes.

    Parameters
    ----------
    missing_values, strategy, fill_value, copy, add_indicator
        Forwarded to ``sklearn.impute.SimpleImputer``.
    verbose : int, default 0
        Kept for signature compatibility and stored on the instance.
    feature_names : list of str, optional
        Returned unchanged by ``get_feature_names``.
    """
    def __init__(self, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False, feature_names = None):
        # Forward by keyword: the parent's parameters are keyword-only in later
        # scikit-learn releases. ``verbose`` is not forwarded because the
        # parent no longer accepts it there; it is stored here instead so that
        # get_params()/clone() still find every constructor argument.
        super().__init__(missing_values=missing_values, strategy=strategy, fill_value=fill_value, copy=copy, add_indicator=add_indicator)
        self.verbose = verbose
        self.feature_names = feature_names
    def get_feature_names(self):
        """Return the feature names supplied at construction time."""
        return self.feature_names
# Columns that are one-hot encoded as they are.
categorial_features = ['Condition2','LotShape','LandContour',
            'LotConfig','LandSlope','Neighborhood',
            'Condition1','BldgType','HouseStyle',
            'RoofStyle','RoofMatl',
            'ExterQual','ExterCond','Foundation','Heating','HeatingQC',
            'CentralAir',
            'PavedDrive','SaleCondition','MoSoldStr','YrSoldStr']

# Columns that are ordinal encoded after most-frequent imputation.
# Entry k of ordinal_categories_impute is the low-to-high scale of entry k here.
categorical_features_ordinal_impute = ["ExterQual","ExterCond","Foundation","Heating","CentralAir","Electrical","KitchenQual","Functional","PavedDrive"]

# Columns that are ordinal encoded after filling missing values with the
# constant "Nan". Entry k of ordinal_categories_const is the scale of entry k here.
categorical_features_ordinal_const = ["BsmtCond","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageQual","GarageCond","BsmtExposure"]

ordinal_categories_impute = [
    ['Po','Fa','TA', 'Gd', 'Ex'],                            # ExterQual
    ['Po','Fa','TA', 'Gd', 'Ex'],                            # ExterCond
    ['Slab','Stone','Wood','BrkTil','CBlock','PConc'],       # Foundation
    ['Floor','Wall','OthW','Grav','GasW','GasA'],            # Heating
    ['N','Y'],                                               # CentralAir
    ['Mix','FuseP','FuseF','FuseA','SBrkr'],                 # Electrical
    ['Po','Fa','TA', 'Gd', 'Ex'],                            # KitchenQual
    ['Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],         # Functional
    ['N','P','Y'],                                           # PavedDrive
]

ordinal_categories_const = [
    ['Nan','Po','Fa','TA', 'Gd', 'Ex'],                                  # BsmtCond
    ['Nan','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],                         # BsmtFinType1
    ['Nan','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],                         # BsmtFinType2
    ['Nan','Po','Fa','TA', 'Gd', 'Ex'],                                  # FireplaceQu
    ['Nan','Detchd','CarPort','2Types','Basment','BuiltIn','Attchd'],    # GarageType
    ['Nan','Po','Fa','TA', 'Gd', 'Ex'],                                  # GarageQual
    ['Nan','Po','Fa','TA', 'Gd', 'Ex'],                                  # GarageCond
    # BsmtExposure, low to high: none < minimum (Mn) < average (Av) < good.
    # 'Mn' must rank below 'Av' for the ordinal codes to be monotone.
    ['Nan','No','Mn','Av','Gd'],                                         # BsmtExposure
]

# One-hot encoded with handle_unknown='ignore' (unseen categories allowed).
categorial_features_ignore_unknown = ['MSSubClass']

# One-hot encoded after filling missing values with the constant "Nan".
categorial_features_with_missing_values_const = ['Alley','BsmtCond','BsmtQual','BsmtExposure','BsmtFinType1',
                                           'BsmtFinType2','FireplaceQu','GarageType','GarageQual','GarageFinish',
                                           'GarageCond','Fence','MiscFeature']
# One-hot encoded after most-frequent imputation.
categorial_features_with_missing_values_impute = ['SaleType','Functional','KitchenQual','Exterior1st','Exterior2nd','MSZoning','MasVnrType','Electrical']
# Every name below is a zero-argument factory: each call builds a fresh,
# unfitted estimator, so pipelines created from them never share state.

def categorical_with_missing_values_pipeline_const():
    """Fill missing categories with the constant "Nan", then one-hot encode."""
    stages = [
        ('imputer', SimpleImputer(strategy='constant', fill_value="Nan")),
        ('onehot', OneHotEncoder(sparse=False,dtype=int,categories = 'auto')),
    ]
    return PipelineWithFeatureNames(steps=stages, feature_names = categorial_features_with_missing_values_const)


def categorical_with_missing_values_pipeline_impute():
    """Fill missing categories with the most frequent value, then one-hot encode.

    Most-frequent is used for now because the median strategy is not available
    for non-numeric data; a model-based imputer could replace it later.
    """
    stages = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse=False,dtype=int,categories = 'auto')),
    ]
    return PipelineWithFeatureNames(steps=stages, feature_names = categorial_features_with_missing_values_impute)


def categorical_ordinal_pipeline_impute():
    """Fill missing categories with the most frequent value, then ordinal encode."""
    stages = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories = ordinal_categories_impute)),
    ]
    return PipelineWithFeatureNames(steps=stages, feature_names = categorical_features_ordinal_impute)


def categorical_ordinal_pipeline_const():
    """Fill missing categories with the constant "Nan", then ordinal encode."""
    stages = [
        ('imputer', SimpleImputer(strategy='constant',fill_value="Nan")),
        ('ordinal', OrdinalEncoder(categories = ordinal_categories_const)),
    ]
    return PipelineWithFeatureNames(steps=stages, feature_names = categorical_features_ordinal_const)


def categorical_pipeline():
    """One-hot encode columns that need no imputation."""
    stages = [
        ('onehot', OneHotEncoder(sparse=False,dtype=int,categories = 'auto')),
    ]
    return PipelineWithFeatureNames(steps=stages, feature_names = categorial_features)


def categorical_pipeline_ignore_unknown():
    """One-hot encode, ignoring categories that were not seen during fit."""
    stages = [
        ('onehot', OneHotEncoder(sparse=False,dtype=int,categories = 'auto',handle_unknown = 'ignore')),
    ]
    return PipelineWithFeatureNames(steps=stages, feature_names = categorial_features_ignore_unknown)


def X_Transformer_col():
    """Build the ColumnTransformer routing each column group to its encoder or imputer.

    Columns not named in any group are passed through unchanged.
    """
    routes = [
        ('categorical_with_missing_values_const', categorical_with_missing_values_pipeline_const(),categorial_features_with_missing_values_const),
        ('categorical_with_missing_values_impute', categorical_with_missing_values_pipeline_impute(),categorial_features_with_missing_values_impute),
        ('categorical_ordinal_impute', categorical_ordinal_pipeline_impute(),categorical_features_ordinal_impute),
        ('categorical_ordinal_const', categorical_ordinal_pipeline_const(),categorical_features_ordinal_const),
        ('categorical',categorical_pipeline(),categorial_features),
        ('categorical_ignore_unknown',categorical_pipeline_ignore_unknown(),categorial_features_ignore_unknown),
        ('numeric_with_missing_values_impute',SimpleImputerWithFeatureNames(strategy='median',feature_names = numeric_columns_impute),numeric_columns_impute),
        ('numeric_with_missing_values_impute_zero',SimpleImputerWithFeatureNames(strategy='constant',fill_value = 0,feature_names = numeric_columns_impute_zero),numeric_columns_impute_zero),
    ]
    return ColumnTransformer(transformers = routes,remainder = "passthrough")


def _x_transformer_base_steps():
    """Steps shared by X_Transformer and X_Transformer_scaled (fresh objects per call)."""
    return [
        ('col_drop',ColumnDrop(['Utilities','Street','PoolQC'])),
        ('col_transformer',X_Transformer_col()),
        ('to_df',ToDataframeConverter()),
        ('overfit_drop',OverfitDrop()),
    ]


def X_Transformer():
    """Full feature pipeline: drop columns, encode/impute, to DataFrame, drop near-constant columns."""
    return Pipeline(steps = _x_transformer_base_steps())


def X_Transformer_scaled():
    """Same as X_Transformer, followed by a RobustScaler."""
    return Pipeline(steps = _x_transformer_base_steps() + [('scaler',RobustScaler())])

In [None]:
# Build a fresh (unfitted) scaled preprocessing pipeline; as the last
# expression of the cell, its repr is what the notebook displays.
X_Transformer_scaled()