In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.decomposition import PCA, KernelPCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler

In [None]:
# Convert the encoded values 2, 2, 1 back through `le.inverse_transform`.
# NOTE(review): `le` is not defined in any visible cell — presumably a fitted
# LabelEncoder left over from an earlier/deleted cell; confirm it exists on a
# fresh kernel (Restart & Run All) or define and fit it before this line.
le.inverse_transform([2, 2, 1])

# preprocessing

In [None]:
"""
StandardScaler 在存在異常值時無法保證各特徵的尺度平衡，
而且 StandardScaler 與 MinMaxScaler 對異常值都非常敏感。MaxAbsScaler 依最大絕對值縮放，使特徵的絕對值落在 [0,1] 範圍內（即取值範圍為 [-1,1]）；當資料僅有正值時，
MaxAbsScaler 與 MinMaxScaler 的效果類似。
RobustScaler 的居中與縮放統計量基於百分位數（中位數與四分位距），因此不受少量極端異常值的影響，但變換後特徵值的範圍會比前述縮放器更大。
Normalizer 則將每個樣本單獨歸一化為單位範數，與樣本的分佈無關。實際選擇時需衡量資料樣本的分佈與業務需求，最終採用一個合適的縮放器。
"""

In [None]:
class skew_andDummies(BaseEstimator, TransformerMixin):
    """Log-transform skewed non-object columns, then one-hot encode the frame.

    Parameters
    ----------
    skew : float, default=0.5
        Absolute-skewness threshold. Every non-object column whose
        ``abs(scipy.stats.skew(column))`` is >= this value gets ``np.log1p``
        applied.

    Attributes
    ----------
    skewed_features_ : list
        Column names selected for ``log1p`` during ``fit``.
    dummy_columns_ : pandas.Index
        Column layout produced by ``pd.get_dummies`` on the fitting data;
        ``transform`` aligns its output to this layout.

    Notes
    -----
    The skewed columns and the dummy layout are learned in ``fit`` so that
    training and test data receive the same transformation and end up with the
    same columns. Calling ``transform`` on an instance that was never fitted
    still works and falls back to deriving both from the data being
    transformed (the historical behaviour of this class).
    """

    def __init__(self,skew=0.5):
        self.skew = skew

    def _find_skewed(self, X):
        """Return the names of non-object columns whose |skewness| >= self.skew."""
        X_numeric = X.select_dtypes(exclude=["object"])
        if X_numeric.shape[1] == 0:
            # Without this guard the boolean mask below would be applied to an
            # empty DataFrame and `.index` would yield row labels, not columns.
            return []
        skewness = X_numeric.apply(lambda x: skew(x))
        return list(skewness[abs(skewness) >= self.skew].index)

    def _log_and_dummies(self, X, skewed):
        """Apply log1p to `skewed` columns of a copy of X and one-hot encode it."""
        X = X.copy()  # never modify the caller's DataFrame
        present = [col for col in skewed if col in X.columns]
        if present:
            X[present] = np.log1p(X[present])
        return pd.get_dummies(X)

    def fit(self,X,y=None):
        """Learn which columns are skewed and the resulting dummy-column layout.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored

        Returns
        -------
        self
        """
        self.skewed_features_ = self._find_skewed(X)
        self.dummy_columns_ = self._log_and_dummies(X, self.skewed_features_).columns
        return self

    def transform(self,X):
        """Return a transformed copy of X.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            When fitted: log1p applied to the columns learned in ``fit`` and
            the result aligned to ``dummy_columns_`` (dummy columns missing
            from X are added filled with 0, columns unseen during ``fit`` are
            dropped). When not fitted: skewness and dummies are derived from
            X itself.
        """
        if not hasattr(self, "skewed_features_"):
            # Backward-compatible path for callers that skip fit().
            return self._log_and_dummies(X, self._find_skewed(X))
        transformed = self._log_and_dummies(X, self.skewed_features_)
        return transformed.reindex(columns=self.dummy_columns_, fill_value=0)

In [1]:
class label_object_cols(BaseEstimator, TransformerMixin):
    """Label-encode object columns that have at most two distinct values.

    Parameters
    ----------
    skew : float, default=0.5
        Unused by this transformer; kept only so existing calls that pass it
        keep working. It is stored on the instance because scikit-learn's
        ``get_params`` / ``clone`` read every ``__init__`` argument back as an
        attribute and raise ``AttributeError`` otherwise.

    Attributes
    ----------
    le : LabelEncoder
        Encoder used only on the unfitted fallback path of ``transform``
        (kept for backward compatibility).
    encoders_ : dict
        Mapping ``column name -> fitted LabelEncoder`` learned in ``fit``.
    """

    def __init__(self,skew=0.5):
        self.skew = skew
        self.le = LabelEncoder()

    @staticmethod
    def _binary_object_columns(X):
        """Return the object-dtype columns of X with <= 2 unique values."""
        return [
            col for col in X
            if X[col].dtype == 'object' and len(X[col].unique()) <= 2
        ]

    def fit(self,X,y=None):
        """Fit one LabelEncoder per qualifying column of X.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored

        Returns
        -------
        self
        """
        # One encoder per column, fitted on the training data, so that the
        # same value -> integer mapping is reused for any later dataset.
        self.encoders_ = {
            col: LabelEncoder().fit(X[col])
            for col in self._binary_object_columns(X)
        }
        return self

    def transform(self,X):
        """Return a copy of X with the qualifying columns replaced by integer codes.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            When fitted: columns learned in ``fit`` are encoded with their
            stored encoder (columns absent from X are skipped). When not
            fitted: qualifying columns are detected on X and encoded with
            ``self.le`` refit per column, as this class historically did.
        """
        X = X.copy()  # never modify the caller's DataFrame
        encoders = getattr(self, "encoders_", None)
        if encoders is None:
            # Backward-compatible path for callers that skip fit().
            for col in self._binary_object_columns(X):
                X[col] = self.le.fit_transform(X[col])
            return X
        for col, encoder in encoders.items():
            if col in X.columns:
                X[col] = encoder.transform(X[col])
        return X

In [None]:
# Assemble the preprocessing pipeline: encode two-valued object columns,
# log-transform skewed columns and one-hot encode the rest, then standardise.
# RobustScaler() is imported as an alternative for the final scaling step.
pipe = Pipeline(
    steps=[
        ('label_object_cols', label_object_cols()),
        ('skew_andDummies', skew_andDummies(skew=1)),
        ('scaler', StandardScaler()),
    ]
)

# PCA

In [None]:
# Project the scaled features onto 410 principal components.
# NOTE(review): `X_scaled` / `test_X_scaled` are not defined in any visible
# cell — presumably the outputs of the preprocessing pipeline; confirm.
pca = PCA(n_components=410)
# Fit on the training matrix only and overwrite it with its projection.
# Because the input name is reused, re-running this cell would fit PCA on the
# already-projected data.
X_scaled=pca.fit_transform(X_scaled)
# Reuse the components learned from the training data for the test matrix.
test_X_scaled = pca.transform(test_X_scaled)

# groupby , fillna

In [None]:
# Per-neighbourhood summary of LotFrontage: mean, median and non-null count.
(
    full
    .groupby(['Neighborhood'])[['LotFrontage']]
    .agg(['mean', 'median', 'count'])
)

In [None]:
# Fill missing LotFrontage with the median LotFrontage of the row's LotAreaCut
# group. Only missing entries are filled: the per-group medians are broadcast
# with transform('median') and passed to fillna, so rows whose group key is
# null (for which transform yields NaN) keep their existing value instead of
# being overwritten with NaN.
full['LotFrontage'] = full['LotFrontage'].fillna(
    full.groupby(['LotAreaCut'])['LotFrontage'].transform('median')
)

In [None]:
# Columns whose missing values are filled with the column's most frequent value.
cols2 = [
    "MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional",
    "Electrical", "KitchenQual", "SaleType", "Exterior1st", "Exterior2nd",
]
for col in cols2:
    modes = full[col].mode()
    # mode() is empty when the column is entirely NaN; leave such a column as is.
    if not modes.empty:
        # Assign the result back instead of `full[col].fillna(..., inplace=True)`:
        # the in-place call acts on a temporary column selection, which pandas
        # warns about and which does not update `full` under Copy-on-Write.
        full[col] = full[col].fillna(modes.iloc[0])

# 缺失值填充：注意只能用訓練集 fit

In [None]:
# Create an imputer that fills missing values with each column's median.
# SimpleImputer (sklearn.impute) replaces the `Imputer` class that was removed
# from scikit-learn.
imputer = SimpleImputer(strategy='median')

# Learn the medians from the training features only, so that no information
# from the test set leaks into the fill values.
imputer.fit(train_features)

# Apply the training-set medians to both the training and the test data.
X = imputer.transform(train_features)
X_test = imputer.transform(test_features)