In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.preprocessing import RobustScaler
from category_encoders.target_encoder import TargetEncoder
from category_encoders.backward_difference import BackwardDifferenceEncoder
from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [None]:

class DataLoader():
    """Load the train/test CSV files and prepare the feature frames used for modelling.

    On construction the training rows flagged as outliers are removed, ``Id``
    (and ``SalePrice`` for the training data) is split off, ``MSSubClass`` is
    cast to ``str``, the engineered features are added and highly skewed
    numeric columns are Box-Cox transformed. The results are available through
    the ``get*`` accessors.
    """

    # Training rows whose GrLivArea is at or above this value are discarded.
    GRLIVAREA_LIMIT = 4500
    # Row positions (counted after the GrLivArea filter and re-index) that are
    # additionally discarded from the training data.
    OUTLIER_POSITIONS = (30, 88, 462, 631, 1322)
    # Numeric columns whose sample skewness exceeds this value are transformed.
    SKEW_THRESHOLD = 0.5
    # dtypes that count as numeric when scanning a frame's columns.
    NUMERIC_DTYPES = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')

    def __init__(self,train_path,test_path):
        """Read both CSV files and run the full preparation.

        Args:
            train_path: location of the training CSV, handed to ``pd.read_csv``.
            test_path: location of the test CSV, handed to ``pd.read_csv``.
        """
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        train = self.__remove_outliers(train)
        self.X_train = train.drop(['Id','SalePrice'], axis = 1)
        self.X_train['MSSubClass'] = self.X_train['MSSubClass'].astype(str)
        self.Y_train = train['SalePrice']
        self.X_test = test.drop(['Id'], axis = 1)
        self.X_test['MSSubClass'] = self.X_test['MSSubClass'].astype(str)
        self.test_Id = test['Id']
        self.X_train = self.feature_engineering(self.X_train)
        self.X_test = self.feature_engineering(self.X_test)
        self.X_train, self.X_test = self.remove_skewness(self.X_train, self.X_test)
        # Optional step, currently disabled:
        #self.X_train, self.X_test = self.add_polynomials(self.X_train, self.X_test)

    def getX_train(self):
        """Return the prepared training features."""
        return self.X_train

    def getX_test(self):
        """Return the prepared test features."""
        return self.X_test

    def getY_train(self):
        """Return the ``SalePrice`` column of the (outlier-filtered) training data."""
        return self.Y_train

    def get_Test_id(self):
        """Return the ``Id`` column of the test data."""
        return self.test_Id

    def _numeric_columns(self, frame):
        """List the columns of ``frame`` whose dtype is one of ``NUMERIC_DTYPES``."""
        return [name for name in frame.columns
                if frame[name].dtype in self.NUMERIC_DTYPES]

    def __remove_outliers(self,train):
        """Drop the outlier rows from the training frame and renumber its index.

        Rows with ``GrLivArea >= GRLIVAREA_LIMIT`` go first; after re-indexing,
        the rows at ``OUTLIER_POSITIONS`` are dropped as well. The positions are
        looked up positionally, so the filtered frame must be long enough to
        contain all of them.
        """
        # Chaining reset_index avoids an in-place change on a filtered slice.
        train = train[train.GrLivArea < self.GRLIVAREA_LIMIT].reset_index(drop=True)
        outliers = list(self.OUTLIER_POSITIONS)
        train = train.drop(train.index[outliers]).reset_index(drop=True)
        return train

    def remove_skewness(self,x_train,x_test):
        """Box-Cox transform the highly skewed numeric columns of both frames.

        Skewness and the Box-Cox lambda are estimated on the concatenation of
        ``x_train`` and ``x_test`` (missing values ignored), so both frames are
        transformed with the same parameter.

        Returns:
            ``(x_train_res, x_test_res)`` - independent copies with the same
            row split as the inputs.
        """
        combined = pd.concat([x_train,x_test])
        numerics = self._numeric_columns(combined)
        skew_features = combined[numerics].apply(lambda col: skew(col.dropna())).sort_values(ascending=False)
        high_skew = skew_features[skew_features > self.SKEW_THRESHOLD]
        for name in high_skew.index:
            # boxcox1p works on 1 + x, hence the lambda is estimated on x + 1.
            combined[name] = boxcox1p(combined[name], boxcox_normmax(combined[name].dropna() + 1))
        n_train = x_train.shape[0]
        # Copies, so that later in-place edits do not touch the temporary frame.
        x_train_res = combined.iloc[:n_train].copy()
        x_test_res = combined.iloc[n_train:].copy()
        return (x_train_res,x_test_res)

    def add_polynomials(self,x_train,x_test):
        """Add a ``<column>square`` column for every numeric column.

        Returns:
            ``(x_train_res, x_test_res)`` - independent copies with the same
            row split as the inputs.
        """
        combined = pd.concat([x_train,x_test])
        # The column list is fixed up front; the new squares are not squared again.
        for name in self._numeric_columns(combined):
            combined[name+'square'] = combined[name]*combined[name]
        n_train = x_train.shape[0]
        x_train_res = combined.iloc[:n_train].copy()
        x_test_res = combined.iloc[n_train:].copy()
        return (x_train_res,x_test_res)

    def feature_engineering(self,features):
        """Add the derived columns to ``features`` in place and return it.

        Adds summed year / area / bathroom / porch columns and 0/1 flags that
        record whether the corresponding source column is greater than zero.
        """
        features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
        features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
        features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])
        features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                               features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))
        features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                              features['EnclosedPorch'] + features['ScreenPorch'] +
                              features['WoodDeckSF'])
        # (flag column, source column) pairs; a missing source value yields 0.
        presence_flags = (('haspool', 'PoolArea'),
                          ('has2ndfloor', '2ndFlrSF'),
                          ('hasgarage', 'GarageArea'),
                          ('hasbsmt', 'TotalBsmtSF'),
                          ('hasfireplace', 'Fireplaces'))
        for flag, source in presence_flags:
            features[flag] = features[source].apply(lambda value: 1 if value > 0 else 0)
        # Disabled experiments, kept for reference:
        #features['Condition1'] = features['Condition1'].apply(lambda x: x if x == "Norm" else "NotNorm")
        #features['BldgType'] = features['BldgType'].apply(lambda x: "1Fam" if (x == "1Fam" or x == "TwnhsE") else "Not1Fam")
        #features['HouseStyle1'] = features['HouseStyle'].apply(lambda x: x if (x == "1Story"or x == "2Story") else "Not12Story")
        return features
        
        

In [None]:
class ColumnDrop(BaseEstimator,TransformerMixin):
    """Pipeline step that removes a fixed set of columns from a DataFrame."""

    def __init__(self,columns):
        # Column labels to remove in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; exists so the step can sit inside a pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns (``X`` is not modified)."""
        trimmed = X.drop(labels=self.columns, axis='columns')
        return trimmed
class FeatureNormalizer(BaseEstimator,TransformerMixin):
    """Box-Cox transform the highly skewed numeric columns of a DataFrame.

    The set of columns and one Box-Cox lambda per column are learned in
    ``fit`` and re-used by ``transform``, so every frame passed through a
    fitted instance is transformed with the same parameters.
    """

    def fit(self, X, y=None):
        """Pick the numeric columns with skewness above 0.5 and estimate their lambdas.

        Skewness is computed without dropping missing values, so a column
        containing NaN gets a NaN skewness and is not selected.

        Sets:
            skew_index: labels of the selected columns.
            lambdas_: mapping of column label to its fitted Box-Cox lambda.
        """
        numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        numerics = [name for name in X.columns if X[name].dtype in numeric_dtypes]
        skew_features = X[numerics].apply(lambda col: skew(col)).sort_values(ascending=False)
        high_skew = skew_features[skew_features > 0.5]
        self.skew_index = high_skew.index
        # boxcox1p works on 1 + x, hence the lambda is estimated on x + 1.
        self.lambdas_ = {name: boxcox_normmax(X[name].dropna() + 1)
                         for name in self.skew_index}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns Box-Cox transformed."""
        # Work on a copy: a transformer must not alter the caller's frame.
        X = X.copy()
        for name in self.skew_index:
            X[name] = boxcox1p(X[name], self.lambdas_[name])
        return X
class ToDataframeConverter(BaseEstimator,TransformerMixin):
    """Pipeline step that wraps its input in a pandas DataFrame."""

    def fit(self, X, y=None):
        """Learn nothing; exists so the step can sit inside a pipeline."""
        return self

    def transform(self, X):
        """Build a DataFrame from ``X`` and let pandas infer better dtypes for object columns."""
        frame = pd.DataFrame(X)
        return frame.infer_objects()
class OverfitDrop(BaseEstimator,TransformerMixin):
    """Drop columns in which a single value accounts for almost every row."""

    def fit(self, X, y=None):
        """Record the columns whose most frequent value covers more than 99.94% of rows.

        Every column is examined. Columns without any non-missing value are
        skipped (and therefore kept), as is everything when ``X`` has no rows.

        Sets:
            overfit: list of column labels that ``transform`` removes.
        """
        n_rows = len(X)
        overfit = []
        for column in X.columns:
            counts = X[column].value_counts()
            if counts.empty:
                # Nothing to count (all values missing, or no rows at all).
                continue
            # value_counts() sorts descending, so the first entry is the
            # count of the most frequent value.
            dominant = counts.iloc[0]
            if dominant / n_rows * 100 > 99.94:
                overfit.append(column)
        self.overfit = overfit
        return self

    def transform(self, X):
        """Return ``X`` without the columns recorded during ``fit``."""
        return X.drop(self.overfit, axis=1)

In [None]:
class TargetNormalizedRegressor(BaseEstimator, RegressorMixin):
    """Wrap a regressor so that it is trained on ``log1p`` of the target.

    Predictions are mapped back to the original scale with ``expm1``.
    """

    def __init__(self,regressor):
        # The wrapped estimator; it is fitted in place by fit().
        self.regressor = regressor

    def fit(self, X_train, Y_train):
        """Fit the wrapped regressor on ``log1p(Y_train)``.

        Returns:
            self, so that ``wrapper.fit(X, y).predict(X)`` goes through this
            class's ``predict`` and yields values on the original target scale.
        """
        self.regressor.fit(X_train,np.log1p(Y_train))
        return self

    def predict(self, X):
        """Predict with the wrapped regressor and undo the log transform."""
        return np.expm1(self.regressor.predict(X))
class BlendingRegressor(BaseEstimator, RegressorMixin):
    """Weighted blend of several regressors."""

    def __init__(self,regressors_and_weights):
        # Iterable of (regressor, weight) pairs.
        self.regressors_and_weights = regressors_and_weights

    def fit(self, X_train, Y_train):
        """Fit every member regressor on the same data and return self."""
        for member, _weight in self.regressors_and_weights:
            member.fit(X_train,Y_train)
        return self

    def predict(self, X):
        """Return the sum of each member's prediction multiplied by its weight."""
        blended = 0
        for member, share in self.regressors_and_weights:
            contribution = share*member.predict(X)
            blended += contribution
        return blended

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.impute import SimpleImputer
class DfColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose output is a labelled DataFrame.

    Column labels come from ``get_feature_names`` and columns left with
    ``object`` dtype are converted to ``category``.
    """

    def transform(self, X):
        """Transform ``X`` and return the result as a labelled DataFrame."""
        raw = ColumnTransformer.transform(self,X)
        frame = pd.DataFrame(raw, columns = self.get_feature_names())
        return self.__tranform_to_categorial(frame.infer_objects())

    def fit_transform(self, X, y=None):
        """Fit, transform ``X`` and return the result as a labelled DataFrame."""
        raw = ColumnTransformer.fit_transform(self, X, y)
        frame = pd.DataFrame(raw, columns = self.get_feature_names())
        return self.__tranform_to_categorial(frame.infer_objects())

    def get_feature_names(self):
        """Concatenate the names reported by each fitted, non-dropped transformer.

        Every such transformer must provide ``get_feature_names()``.
        """
        collected = []
        for _name, trans, _third, _fourth in self._iter(fitted=True):
            if trans == 'drop':
                continue
            collected.extend(list(trans.get_feature_names()))
        return collected

    def __tranform_to_categorial(self,df):
        """Convert every ``object`` column of ``df`` to ``category`` in place and return it."""
        object_columns = [label for label in df if df[label].dtype == 'object']
        for label in object_columns:
            df[label] = df[label].astype('category')
        return df
    
class PipelineWithFeatureNames(Pipeline):
    """Pipeline that can report the names of the features it outputs."""

    def __init__(self, steps, memory=None, verbose=False, feature_names = None):
        """Create the pipeline.

        Args:
            steps, memory, verbose: forwarded to ``Pipeline``.
            feature_names: names of the columns fed into the pipeline.
        """
        # memory/verbose are keyword-only in current scikit-learn releases,
        # so they are passed by name rather than by position.
        Pipeline.__init__(self, steps, memory=memory, verbose=verbose)
        self.feature_names = feature_names

    def get_feature_names(self):
        """Return the output feature names.

        When the last step is a OneHotEncoder and input names are known, the
        encoder expands them into one name per category; otherwise the stored
        ``feature_names`` are returned unchanged.
        """
        last_step = self._final_estimator
        if isinstance(last_step, OneHotEncoder) and self.feature_names:
            namer = getattr(last_step, 'get_feature_names', None)
            if namer is None:
                # Recent scikit-learn only offers get_feature_names_out.
                namer = last_step.get_feature_names_out
            return namer(input_features = self.feature_names)
        return self.feature_names
class StubTransformer(SimpleImputer):
    """Pass-through transformer that only carries a list of feature names.

    It derives from SimpleImputer but never calls its constructor and
    overrides fitting and transforming, so no imputation takes place.
    """

    def __init__(self,feature_names):
        # Names reported by get_feature_names().
        self.feature_names = feature_names

    def get_feature_names(self):
        """Return the feature names given at construction."""
        return self.feature_names

    def fit(self, X, y=None):
        """Learn nothing and return self."""
        return self

    def transform(self, X):
        """Return ``X`` untouched."""
        return X

    def fit_transform(self, X, y=None):
        """Return ``X`` untouched."""
        return X
class SimpleImputerWithFeatureNamesWorking(SimpleImputer):
    """SimpleImputer that also remembers and reports feature names."""

    def __init__(self, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False, feature_names = None):
        """Create the imputer.

        Args:
            missing_values, strategy, fill_value, copy, add_indicator:
                forwarded to ``SimpleImputer``.
            verbose: kept for backward compatibility and stored on the
                instance; it is not forwarded because recent scikit-learn
                releases no longer accept it.
            feature_names: names reported by ``get_feature_names``.
        """
        # SimpleImputer's parameters are keyword-only in current scikit-learn
        # releases, so they are passed by name rather than by position.
        SimpleImputer.__init__(self, missing_values=missing_values, strategy=strategy,
                               fill_value=fill_value, copy=copy,
                               add_indicator=add_indicator)
        # get_params()/clone() read every constructor argument back from an
        # attribute of the same name, so verbose must exist on the instance.
        self.verbose = verbose
        self.feature_names = feature_names

    def get_feature_names(self):
        """Return the feature names given at construction."""
        return self.feature_names
    
# Column groups consumed by the transformer factories defined further down.

# Categorical columns; appended to the "impute" group below.
categorial_features = [
    'Condition2', 'LotShape', 'LandContour', 'LandSlope',
    'LotConfig', 'Neighborhood',
    'Condition1', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl',
    'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
    'CentralAir',
    'PavedDrive', 'SaleCondition',
]

categorial_features_ignore_unknown = ['MSSubClass']

# Categorical columns whose missing values are replaced by a constant.
categorial_features_with_missing_values_const = [
    'Alley', 'BsmtFinType1',
    'FireplaceQu', 'Fence', 'MiscFeature', 'GarageType',
]

# Remaining categorical columns, followed by everything in categorial_features.
categorial_features_with_missing_values_impute = [
    'SaleType', 'Functional', 'KitchenQual',
    'Exterior1st', 'Exterior2nd', 'MSZoning',
    'MasVnrType', 'Electrical', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType2',
    'GarageQual', 'GarageFinish', 'GarageCond',
] + categorial_features

# Numeric columns, including the ones added by DataLoader.feature_engineering.
numeric_columns_impute = [
    'LotFrontage', 'GarageArea', 'GarageCars', 'BsmtHalfBath', 'BsmtFullBath',
    'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF1', 'BsmtFinSF2', 'MasVnrArea',
    'GarageYrBlt',
    'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold',
    'YrSold', 'YrBltAndRemod', 'TotalSF', 'Total_sqr_footage',
    'Total_Bathrooms', 'Total_porch_sf',
    'haspool', 'has2ndfloor', 'hasgarage', 'hasbsmt', 'hasfireplace',
]
def categorical_with_missing_values_pipeline_const():
    """Build a pipeline that fills missing categorical values with the string "nan"."""
    names = categorial_features_with_missing_values_const
    imputer = SimpleImputerWithFeatureNamesWorking(strategy='constant', fill_value="nan", feature_names = names)
    return PipelineWithFeatureNames(steps=[('imputer', imputer)], feature_names = names)


def categorical_with_missing_values_pipeline_impute():
    """Build the pipeline for the categorical columns of the "impute" group.

    NOTE: the only step is a StubTransformer, which returns its input
    unchanged, so nothing is imputed here yet. The original note reads: for
    now most-frequent imputation is intended (median is not supported for
    these columns); later a model-based imputer may be used instead.
    """
    names = categorial_features_with_missing_values_impute
    passthrough = StubTransformer(feature_names = names)
    return PipelineWithFeatureNames(steps=[('imputer', passthrough)], feature_names = names)


def X_Transformer_col():
    """Build the column transformer that routes each column group to its handler."""
    return DfColumnTransformer(transformers = [
        ('categorical_with_missing_values_const',
         categorical_with_missing_values_pipeline_const(),
         categorial_features_with_missing_values_const),
        ('categorical_with_missing_values_impute',
         categorical_with_missing_values_pipeline_impute(),
         categorial_features_with_missing_values_impute),
        ('numeric_with_missing_values_impute',
         StubTransformer(feature_names = numeric_columns_impute),
         numeric_columns_impute),
    ])


# Extra columns to drop per group; all empty at the moment.
cat_const_remove = []
cat_impute_remove = []

cat_remove = []


def _columns_to_drop():
    """Assemble the drop list; the module-level lists are read at call time."""
    return ['Utilities','Street','PoolQC'] + cat_remove + cat_impute_remove + cat_const_remove


def X_Transformer():
    """Build the preprocessing pipeline: column drop, then column transformer."""
    return Pipeline(steps = [
        ('col_drop', ColumnDrop(_columns_to_drop())),
        ('col_transformer', X_Transformer_col()),
    ])


def X_Transformer_scaled():
    """Build the preprocessing pipeline with OverfitDrop and RobustScaler appended."""
    return Pipeline(steps = [
        ('col_drop', ColumnDrop(_columns_to_drop())),
        ('col_transformer', X_Transformer_col()),
        ('overfit_drop', OverfitDrop()),
        ('scaler', RobustScaler()),
    ])