# Transformers - Composite Transformers

In [1]:
import numpy as np

In [2]:
data = np.array([[20.0, 'male'], [11.2, 'female'], [15.6, 'female'], [13.0, 'male'], [18.6, 'male'], [16.4, 'female']])
print(data)

[['20.0' 'male']
 ['11.2' 'female']
 ['15.6' 'female']
 ['13.0' 'male']
 ['18.6' 'male']
 ['16.4' 'female']]


In [4]:
#Column Transformer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder

column_trans = ColumnTransformer(
    [
        ('ageScaler', MaxAbsScaler(), [0]),
        ('genderEnoder', OneHotEncoder(dtype='int'), [1])
    ], 
    remainder= 'drop', 
    verbose_feature_names_out= True
)

column_transformed_data = column_trans.fit_transform(data)

print(column_transformed_data)


[[1.   0.   1.  ]
 [0.56 1.   0.  ]
 [0.78 1.   0.  ]
 [0.65 0.   1.  ]
 [0.93 0.   1.  ]
 [0.82 1.   0.  ]]


In [5]:
# Transformed Target Regressor

from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor

tt = TransformedTargetRegressor(regressor=LinearRegression(), func=np.log, inverse_func=np.exp)

# X = np.arange(4).reshape(-1, 1)
# y = np.exp(2 * X).ravel()
# tt.fit()

In [6]:
# Chaining Transformers

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# It is important to implement them in same order
# Therefore we have pipeline and feature union

si = SimpleImputer()
# x_imputed = si.fit_transform(x)
ss = StandardScaler()
# x_scaled = ss.fit_transform(x_imputed)


In [7]:
from sklearn.pipeline import Pipeline
estimators = [
    ('simpleImputer', SimpleImputer()),
    ('standardScaler', StandardScaler())
]

pipe = Pipeline(steps=estimators)
# pip.fit_transform(x)

In [11]:
from sklearn.decomposition import PCA
estimators = [
    ('SimpleImputer', SimpleImputer()),
    ('pca', PCA()),
    ('regressor', LinearRegression())
]
pipe = Pipeline(steps=estimators)

# pipe.named_steps.pca

pipe.set_params(pca__n_components = 2)

## Tutorial

In [13]:
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np
import ssl
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
context = ssl._create_unverified_context()
data = urllib.request.urlopen(url, context=context)
df = pd.read_csv(data, sep=';')

In [15]:
df.describe

<bound method NDFrame.describe of       fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0               7.4             0.700         0.00  ...       0.56      9.4        5
1               7.8             0.880         0.00  ...       0.68      9.8        5
2               7.8             0.760         0.04  ...       0.65      9.8        5
3              11.2             0.280         0.56  ...       0.58      9.8        6
4               7.4             0.700         0.00  ...       0.56      9.4        5
...             ...               ...          ...  ...        ...      ...      ...
1594            6.2             0.600         0.08  ...       0.58     10.5        5
1595            5.9             0.550         0.10  ...       0.76     11.2        6
1596            6.3             0.510         0.13  ...       0.75     11.0        6
1597            5.9             0.645         0.12  ...       0.71     10.2        5
1598            6.0            

In [16]:
wine_data = df

In [19]:
wine_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [20]:
transformer = FunctionTransformer(np.log1p, validate=True)
wine_data_transformed = transformer.transform(np.array(wine_data))
pd.DataFrame(wine_data_transformed, columns=wine_data.columns).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,2.215842,0.1781,1.722767,2.091864,2.186051,2.322388,2.827314
volatile acidity,1599.0,0.417173,0.114926,0.113329,0.329304,0.41871,0.494696,0.947789
citric acid,1599.0,0.228147,0.152423,0.0,0.086178,0.231112,0.350657,0.693147
residual sugar,1599.0,1.218131,0.269969,0.641854,1.064711,1.163151,1.280934,2.80336
chlorides,1599.0,0.083038,0.038991,0.011929,0.067659,0.076035,0.086178,0.476855
free sulfur dioxide,1599.0,2.639013,0.62379,0.693147,2.079442,2.70805,3.091042,4.290459
total sulfur dioxide,1599.0,3.63475,0.682575,1.94591,3.135494,3.663562,4.143135,5.669881
density,1599.0,0.691519,0.000945,0.68817,0.690945,0.691521,0.692064,0.69499
pH,1599.0,1.460557,0.03576,1.319086,1.437463,1.460938,1.481605,1.611436
sulphates,1599.0,0.501073,0.093731,0.285179,0.438255,0.482426,0.548121,1.098612


In [21]:
# Polynomial Features

from sklearn.preprocessing import PolynomialFeatures

wine_data_copy = wine_data.copy()
wine_data = wine_data.drop(['quality'], axis=1)
print(wine_data.shape)

(1599, 11)


In [22]:
poly = PolynomialFeatures(degree=2)
poly_winw_data = poly.fit_transform(wine_data)
print(poly_winw_data.shape)

(1599, 78)


In [24]:
poly.get_feature_names_out()

array(['1', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'fixed acidity^2', 'fixed acidity volatile acidity',
       'fixed acidity citric acid', 'fixed acidity residual sugar',
       'fixed acidity chlorides', 'fixed acidity free sulfur dioxide',
       'fixed acidity total sulfur dioxide', 'fixed acidity density',
       'fixed acidity pH', 'fixed acidity sulphates',
       'fixed acidity alcohol', 'volatile acidity^2',
       'volatile acidity citric acid', 'volatile acidity residual sugar',
       'volatile acidity chlorides',
       'volatile acidity free sulfur dioxide',
       'volatile acidity total sulfur dioxide',
       'volatile acidity density', 'volatile acidity pH',
       'volatile acidity sulphates', 'volatile acidity alcohol',
       'citric acid^2', 'citric acid residual sugar',
       'citric acid chlorides', 'citric aci