In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder


In [2]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer


In [3]:
# Read the training and test CSV files from the current working directory.
train_ds, test_ds = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [4]:
# Remove the 'Id' column from the training frame (test_ds is left as loaded).
train_ds = train_ds.drop(columns=['Id'])

# Snapshot the column labels of both frames as they are at this point.
train_cols, test_cols = train_ds.columns, test_ds.columns



In [5]:
# For every column that has at least one NaN, print:
#   <column> -> <number of non-null values> - <dtype>
for n in train_cols:
    if train_ds[n].isna().any():
        print(n, "->", train_ds[n].count(), "-", train_ds[n].dtypes)


LotFrontage -> 1201 - float64
Alley -> 91 - object
MasVnrType -> 588 - object
MasVnrArea -> 1452 - float64
BsmtQual -> 1423 - object
BsmtCond -> 1423 - object
BsmtExposure -> 1422 - object
BsmtFinType1 -> 1423 - object
BsmtFinType2 -> 1422 - object
Electrical -> 1459 - object
FireplaceQu -> 770 - object
GarageType -> 1379 - object
GarageYrBlt -> 1379 - float64
GarageFinish -> 1379 - object
GarageQual -> 1379 - object
GarageCond -> 1379 - object
PoolQC -> 7 - object
Fence -> 281 - object
MiscFeature -> 54 - object


In [6]:
# Partition the column labels by dtype: object/category vs. float64/int64.
CATEGORICAL_DTYPES = ['object', 'category']
NUMERIC_DTYPES = ['float64', 'int64']

categorical_cols = train_ds.select_dtypes(include=CATEGORICAL_DTYPES).columns
numeric_cols = train_ds.select_dtypes(include=NUMERIC_DTYPES).columns

# Display the categorical column labels as the cell output.
categorical_cols

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [7]:
# Print the columns that have fewer than 10 non-null values.
for m in train_cols:
    if train_ds[m].count() < 10:
        print(m)

PoolQC


In [8]:
# Keep the 'SalePrice' column as the target and remove it from the features.
target = train_ds['SalePrice']
train_ds = train_ds.drop(columns=['SalePrice'])

# Display the target's shape as the cell output.
target.shape

(1460,)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train_ds,
    target,
    test_size=0.2,
    random_state=42,
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1168, 79), (292, 79), (1168,), (292,))

## **Class**

class MeanEncoder(BaseEstimator, TransformerMixin):
    """Replace each category with the mean target value observed for it in ``fit``.

    Parameters
    ----------
    cols : list of str
        Names of the columns of ``X`` to encode.

    Attributes
    ----------
    mapping : dict
        ``{column name: Series}`` where each Series maps a category to the mean
        target value of the training rows in that category. Empty until ``fit``.
    global_mean_ : float
        Mean of the training target. Used at transform time for missing values
        and for categories that were not seen during ``fit``.
    """

    def __init__(self, cols):
        # No printing here: the constructor runs again whenever the estimator
        # is re-created, so debug output would be repeated each time.
        self.cols = cols
        self.mapping = {}

    def fit(self, X, y):
        """Learn the per-category target means.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame containing every column listed in ``cols``.
        y : array-like of shape (n_samples,)
            Target values, positionally aligned with the rows of ``X``.

        Returns
        -------
        self
        """
        # X does not contain the target column, so group the target itself by
        # the category values. Re-indexing y on X's index aligns row by row.
        target = pd.Series(np.asarray(y), index=X.index)
        self.global_mean_ = target.mean()
        self.mapping = {
            col: target.groupby(X[col]).mean() for col in self.cols
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the columns in ``cols`` mean-encoded.

        Missing values and categories unseen during ``fit`` receive
        ``global_mean_``.
        """
        X_encoded = X.copy()
        for col in self.cols:
            encoded = X_encoded[col].map(self.mapping[col])
            # Assign the filled result back instead of calling
            # fillna(inplace=True) on a column selection, which is not
            # guaranteed to modify X_encoded. The fallback is the training
            # target mean, so it does not depend on the data being transformed.
            X_encoded[col] = encoded.fillna(self.global_mean_)
        return X_encoded

## **Pipeline**

# Plot a 0/1 "BsmtQual is missing" indicator against every feature column.
# The raw BsmtQual values cannot be used on the y-axis for this: rows whose
# BsmtQual is NaN would be dropped from the scatter plot, so the missing rows
# the title refers to would never be drawn.
bsmtqual_missing = train_ds['BsmtQual'].isna().astype(int)

# Iterate over the frame's current columns rather than the earlier
# `train_cols` snapshot, which still lists 'SalePrice' even though that column
# has since been dropped from train_ds.
for n in train_ds.columns:
    # One figure per feature so the figsize applies to every plot.
    fig, ax = plt.subplots(figsize=(4, 2))
    sns.scatterplot(x=train_ds[n], y=bsmtqual_missing, ax=ax)
    ax.set_title(f"Relationship between missing values in BsmtQual and {n}")
    ax.set_xlabel(n)
    ax.set_ylabel("Missing values in BsmtQual")
    plt.show()
    # Release the figure; dozens of open figures otherwise pile up in memory.
    plt.close(fig)

In [10]:
# Per-category counts of MasVnrType and their total (NaN rows are excluded).
masvnr_counts = train_ds['MasVnrType'].value_counts()
masvnr_counts, masvnr_counts.sum()

(MasVnrType
 BrkFace    445
 Stone      128
 BrkCmn      15
 Name: count, dtype: int64,
 588)

In [11]:
# Look up the column labels at positions 4, 8 and 5 (in that order).
col = train_ds.columns.to_numpy()[[4, 8, 5]]
col


array(['Street', 'Utilities', 'Alley'], dtype=object)

In [12]:
from sklearn.pipeline import Pipeline , make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [13]:
trf_fillna=ColumnTransformer([
    #msmt,Alley,MasVnrType,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
    ("zero_fill", SimpleImputer(strategy='constant', fill_value=0),[29,30,31,32,34,5,24,25,56,57,58,59,62,63,71,72,73]),
    #Electrical
    ("Electrical_fill", SimpleImputer(strategy='most_frequent'),[41]),
    #LotFrontage
    ("LF_fill", SimpleImputer(strategy='most_frequent'),[2])
],remainder='passthrough')

In [14]:
trf_encoder=ColumnTransformer(transformers=[
        ('mean_encoder',TargetEncoder(), [57, 73])
        # Add other transformers for numerical columns if needed
        # ('numerical_transformer', NumericalTransformer(), ['num_col1', 'num_col2']),
    ],remainder='passthrough')

In [15]:
# Object/category-typed columns of the training split, plus how many there are.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
print(categorical_cols, categorical_cols.size)

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object') 43


In [16]:
# Print a summary of the target Series: index range, non-null count, dtype
# and memory usage.
target.info()


<class 'pandas.core.series.Series'>
RangeIndex: 1460 entries, 0 to 1459
Series name: SalePrice
Non-Null Count  Dtype
--------------  -----
1460 non-null   int64
dtypes: int64(1)
memory usage: 11.5 KB


In [17]:
# Chain imputation and encoding; verbose=True logs the time spent per step.
preprocessing_steps = (trf_fillna, trf_encoder)
pipe = make_pipeline(*preprocessing_steps, verbose=True)

# Fit on the training split only (the target is needed by the encoder step).
pipe.fit(X_train, y_train)

[Pipeline]  (step 1 of 2) Processing columntransformer-1, total=   0.0s
[Pipeline]  (step 2 of 2) Processing columntransformer-2, total=   0.0s


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [18]:
from sklearn import set_config
set_config(display='diagram')

# Run the fitted pipeline over the full feature frame.
transformed_data = pipe.transform(train_ds)

# Wrap the result in a DataFrame labelled with the feature frame's columns.
# NOTE(review): a ColumnTransformer emits its listed columns first and the
# passthrough remainder last. If `transformed_data` is a plain ndarray, its
# column order therefore differs from `train_ds.columns` and these labels do
# not line up with the data -- confirm before trusting per-column results.
transformed_df = pd.DataFrame(data=transformed_data, columns=train_ds.columns)

# Number of NaN values left in the column labelled 'Fence'.
print(transformed_df['Fence'].isna().sum())