In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p

In [None]:
def cats_by_distribution_col(df,x_col,y_col):
    """Return the categories of ``x_col`` ordered by ascending median of ``y_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    x_col : str
        Name of the categorical column to group by.
    y_col : str
        Name of the numeric column whose per-category median defines the order.

    Returns
    -------
    list
        Distinct values of ``x_col`` (rows with a missing ``x_col`` are ignored
        by ``groupby``), lowest median first.
    """
    # Select only y_col: indexing a groupby with a bare tuple of columns is
    # rejected by current pandas, and the grouping key is already the index.
    medians = df.groupby(x_col)[y_col].median()
    # mergesort is stable, so categories with equal medians keep a deterministic order.
    return medians.sort_values(kind="mergesort").index.tolist()
def cats_by_distribution(df,x_cols,y_col):
    """Build one median-ordered category list per column in ``x_cols``.

    Each entry is the result of ``cats_by_distribution_col`` for the
    corresponding column, in the same order as ``x_cols``.
    """
    ordered_per_column = []
    for column in x_cols:
        ordered_per_column.append(cats_by_distribution_col(df, column, y_col))
    return ordered_per_column

In [None]:
# Three groups of column names. NOTE(review): none of them is referenced in
# the visible cells; confirm whether later cells still use them.
drop1 = [
    'Street', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl',
    'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
    'CentralAir',
    'PavedDrive', 'SaleCondition',
]
drop2 = [
    'Alley', 'BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageQual', 'GarageFinish',
    'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
drop3 = [
    'SaleType', 'Functional', 'KitchenQual', 'Exterior1st',
    'Exterior2nd', 'MSZoning', 'MasVnrType', 'Electrical',
]
class DataLoader():
    """Read a CSV and split it into a feature frame ``X`` and a target ``Y``.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    outliers : iterable of int, optional
        Positional row numbers (0-based, into the frame as read) to remove
        before splitting. Defaults to ``DEFAULT_OUTLIERS``; pass an empty
        iterable to keep every row.

    Attributes
    ----------
    df : the loaded frame after outlier rows are removed.
    X : ``df`` without the columns in ``NON_FEATURE_COLUMNS``.
    Y : the ``'SalePrice'`` column of ``df``.

    Raises
    ------
    IndexError
        If an outlier position is beyond the number of rows read.
    KeyError
        If any column in ``NON_FEATURE_COLUMNS`` is absent from the CSV.
    """
    # Row positions removed when the caller does not supply its own list.
    DEFAULT_OUTLIERS = (30, 88, 462, 631, 1322)
    # Columns that never enter the feature matrix (identifier, target and
    # four columns that are discarded outright).
    NON_FEATURE_COLUMNS = ['Id','SalePrice','Utilities', 'Street', 'PoolQC','Alley']

    def __init__(self,path,outliers=None):
        self.df = pd.read_csv(path)
        positions = list(self.DEFAULT_OUTLIERS if outliers is None else outliers)
        if positions:
            # Translate positions into index labels so drop() removes exactly those rows.
            self.df = self.df.drop(self.df.index[positions])
        self.X = self.df.drop(self.NON_FEATURE_COLUMNS, axis = 1)
        # Idea left disabled by the author: fill missing 'LotFrontage' with the
        # per-'Neighborhood' median before modelling.
        self.Y = self.df['SalePrice']
    def getX(self):
        """Return the feature frame."""
        return self.X
    def getY(self):
        """Return the target series."""
        return self.Y

In [None]:
# Training data read directly from train.csv; unlike DataLoader, no rows or
# columns are removed here. The ordinal pipeline factory below passes this
# frame to cats_by_distribution to order categories by median 'SalePrice'.
train_data = pd.read_csv('train.csv')

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn.impute import SimpleImputer
class PipelineWithFeatureNames(Pipeline):
    """Pipeline that remembers its input column names.

    Parameters
    ----------
    steps, memory, verbose
        Forwarded unchanged to ``sklearn.pipeline.Pipeline``.
    feature_names : list of str, optional
        Names of the columns this pipeline receives; handed to the final
        step when output feature names are requested.
    """
    def __init__(self, steps, memory=None, verbose=False, feature_names = None):
        # memory/verbose must be passed by keyword: Pipeline declares them
        # keyword-only in current scikit-learn releases.
        super().__init__(steps, memory=memory, verbose=verbose)
        self.feature_names = feature_names
    def get_feature_names(self):
        """Return the output feature names reported by the final step."""
        final_step = self.steps[-1][-1]
        if hasattr(final_step, 'get_feature_names'):
            return final_step.get_feature_names(input_features = self.feature_names)
        # Newer scikit-learn transformers only expose get_feature_names_out.
        return final_step.get_feature_names_out(self.feature_names)
class SimpleImputerWithFeatureNames(SimpleImputer):
    """SimpleImputer that reports a fixed list of feature names.

    Parameters
    ----------
    missing_values, strategy, fill_value, copy, add_indicator
        Forwarded to ``sklearn.impute.SimpleImputer``.
    verbose : int
        Stored on the instance only (see the note in ``__init__``).
    feature_names : list of str, optional
        Names returned verbatim by ``get_feature_names``.
    """
    def __init__(self, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False, feature_names = None):
        # SimpleImputer's parameters are keyword-only, so nothing is passed
        # positionally. `verbose` is not forwarded because newer releases no
        # longer accept it; it is kept as an attribute so get_params()/clone()
        # still find every parameter named in this signature.
        super().__init__(
            missing_values=missing_values,
            strategy=strategy,
            fill_value=fill_value,
            copy=copy,
            add_indicator=add_indicator,
        )
        self.verbose = verbose
        self.feature_names = feature_names
    def get_feature_names(self):
        """Return the feature names supplied at construction time."""
        return self.feature_names
# Column groups consumed by the pipeline factories and X_Transformer below.

# Categorical columns that are one-hot encoded directly, with no imputation
# step (see categorical_pipeline).
categorial_features = ['LotShape','LandContour',
            'LotConfig','LandSlope','Neighborhood',
            'Condition1','Condition2','BldgType','HouseStyle',
            'RoofStyle','RoofMatl',
            'ExterQual','ExterCond','Foundation','Heating','HeatingQC',
            'CentralAir',
            'PavedDrive','SaleCondition']
# NOTE(review): not referenced in the visible cells; 'Neighborhood' is also
# listed in categorial_features above, where it is one-hot encoded.
categorial_features_ordinal = ['Neighborhood']


# Columns for the ordinal pipeline factory; currently empty.
categorial_features_ordinal_with_missing_values_impute = []

# NOTE(review): nbhd_cats and mssubclass_cats are not referenced in the visible
# cells. Presumably hand-written category orderings for 'Neighborhood' and
# 'MSSubClass' -- confirm before relying on them.
nbhd_cats = ['MeadowV','IDOTRR','BrDale','OldTown','Edwards',
               'BrkSide','Sawyer','Blueste','SWISU','NAmes','NPkVill','Mitchel','SawyerW','Gilbert','NWAmes',
               'Blmngtn','CollgCr','ClearCr','Crawfor','Veenker','Somerst','Timber','StoneBr','NoRidge','NridgHt']
mssubclass_cats = ['180','30','45','190','50','90','85','40','160','150','70','20','75','80','120','60']

# Categorical columns whose missing entries are filled with the constant
# string "Nan" before one-hot encoding.
categorial_features_with_missing_values_const = ['BsmtCond','BsmtQual','BsmtExposure','BsmtFinType1',
                                           'BsmtFinType2','FireplaceQu','GarageType','GarageQual','GarageFinish',
                                           'GarageCond','Fence','MiscFeature']
# Categorical columns whose missing entries are filled with the most frequent
# value before one-hot encoding.
categorial_features_with_missing_values_impute = ['SaleType','Functional','KitchenQual','Exterior1st','Exterior2nd','MSZoning','MasVnrType','Electrical']
# Numeric columns whose missing entries are filled with the column median
# (see the last transformer in X_Transformer).
numeric_columns_impute = ['GarageArea','GarageCars','BsmtHalfBath','BsmtFullBath','TotalBsmtSF','BsmtUnfSF','BsmtFinSF1','BsmtFinSF2','MasVnrArea','GarageYrBlt']
# Disabled alternative: impute no numeric columns.
#numeric_columns_impute = []
def categorical_with_missing_values_pipeline_const():
    """Build a fresh pipeline: constant-fill missing values, then one-hot encode.

    Missing entries are replaced by the string "Nan", so they end up as a
    category of their own in the encoder output.
    """
    fill_then_encode = [
        ('imputer', SimpleImputer(strategy='constant', fill_value="Nan")),
        ('onehot', OneHotEncoder(sparse=False,dtype=int,categories = 'auto')),
    ]
    return PipelineWithFeatureNames(
        steps=fill_then_encode,
        feature_names = categorial_features_with_missing_values_const,
    )
def categorical_with_missing_values_pipeline_impute():
    """Build a fresh pipeline: most-frequent imputation, then one-hot encode.

    Most-frequent is the stand-in for now, since the 'median' strategy is not
    available for non-numeric columns; a model-based imputer could replace it
    later.
    """
    impute_then_encode = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse=False,dtype=int,categories = 'auto')),
    ]
    return PipelineWithFeatureNames(
        steps=impute_then_encode,
        feature_names = categorial_features_with_missing_values_impute,
    )

def categorical_ordinal_with_missing_values_pipeline_impute():
    """Build a fresh pipeline: most-frequent imputation, then ordinal encode.

    The category order for each column is computed from ``train_data`` on every
    call, ranking categories by their median 'SalePrice'.
    """
    ordered_categories = cats_by_distribution(
        train_data,
        categorial_features_ordinal_with_missing_values_impute,
        'SalePrice',
    )
    impute_then_rank = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(dtype=int,categories = ordered_categories)),
    ]
    return PipelineWithFeatureNames(
        steps=impute_then_rank,
        feature_names = categorial_features_ordinal_with_missing_values_impute,
    )

def categorical_pipeline():
    """Build a fresh one-hot-only pipeline for the columns in ``categorial_features``."""
    encode_only = [
        ('onehot', OneHotEncoder(sparse=False,dtype=int,categories = 'auto')),
    ]
    return PipelineWithFeatureNames(steps=encode_only, feature_names = categorial_features)
def X_Transformer():
    """Assemble a new, unfitted ColumnTransformer for the feature frame.

    Each column group gets its own sub-pipeline; columns not named in any
    group are passed through unchanged.
    """
    numeric_imputer = SimpleImputerWithFeatureNames(
        strategy='median',
        feature_names = numeric_columns_impute,
    )
    column_groups = [
        (
            'categorical_with_missing_values_const',
            categorical_with_missing_values_pipeline_const(),
            categorial_features_with_missing_values_const,
        ),
        (
            'categorical_with_missing_values_impute',
            categorical_with_missing_values_pipeline_impute(),
            categorial_features_with_missing_values_impute,
        ),
        ('categorical', categorical_pipeline(), categorial_features),
        ('numeric_with_missing_values_impute', numeric_imputer, numeric_columns_impute),
    ]
    return ColumnTransformer(transformers = column_groups, remainder = "passthrough")