In [14]:
import pandas as pd
import numpy as np
import ssl
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt

## Adding Dummy Features
Augments the dataset with a column vector in which every value is 1. This is useful for adding a bias (intercept) parameter to the model.

In [15]:
# Toy feature matrix: 4 samples (rows) x 2 features (columns).
x = np.array([[7, 1], [1, 8], [2, 0], [9, 6]])

In [16]:
from sklearn.preprocessing import add_dummy_feature

# Prepend a constant column (value 1.0, the library default) to the feature
# matrix so that a model can learn a bias/intercept weight for it.
x_new = add_dummy_feature(x, value=1.0)

print(x_new)

[[1. 7. 1.]
 [1. 1. 8.]
 [1. 2. 0.]
 [1. 9. 6.]]


## Custom Transformers
Enables conversion of an existing Python function into a transformer to assist in data cleaning or processing.
Useful when:
1. The dataset consists of heterogeneous data types (e.g., raster images, text captions, etc.).
2. The dataset is stored in a `pandas.DataFrame` and different columns require different processing pipelines.
3. We need stateless transformations such as taking the log of frequencies, custom scaling, etc.


In [17]:
from sklearn.preprocessing import FunctionTransformer

In [19]:
# Red-wine quality dataset (semicolon-separated CSV) hosted by the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# NOTE(review): this disables TLS certificate verification through a private
# `ssl` helper, so the download is not protected against tampering. It is kept
# as-is to avoid breaking environments with missing CA certificates; prefer
# `ssl.create_default_context()` once certificates are configured.
context = ssl._create_unverified_context()

# Open the response as a context manager so the underlying HTTP connection is
# closed as soon as pandas has finished parsing the CSV.
with urllib.request.urlopen(url, context=context) as data:
    wine_data = pd.read_csv(data, sep=';')

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
wine_data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [21]:
wine_data.describe().T  # transpose: one row per feature, one column per statistic

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [22]:
# Wrap NumPy's log1p, i.e. log(1 + x), as a scikit-learn transformer.
# The transformation is stateless, so transform() is called directly.
transformer = FunctionTransformer(func=np.log1p, validate=True)

wine_data_transformed = transformer.transform(wine_data.to_numpy())

# Summarise the transformed values, one row per original column.
(
    pd.DataFrame(wine_data_transformed, columns=wine_data.columns)
    .describe()
    .T
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,2.215842,0.1781,1.722767,2.091864,2.186051,2.322388,2.827314
volatile acidity,1599.0,0.417173,0.114926,0.113329,0.329304,0.41871,0.494696,0.947789
citric acid,1599.0,0.228147,0.152423,0.0,0.086178,0.231112,0.350657,0.693147
residual sugar,1599.0,1.218131,0.269969,0.641854,1.064711,1.163151,1.280934,2.80336
chlorides,1599.0,0.083038,0.038991,0.011929,0.067659,0.076035,0.086178,0.476855
free sulfur dioxide,1599.0,2.639013,0.62379,0.693147,2.079442,2.70805,3.091042,4.290459
total sulfur dioxide,1599.0,3.63475,0.682575,1.94591,3.135494,3.663562,4.143135,5.669881
density,1599.0,0.691519,0.000945,0.68817,0.690945,0.691521,0.692064,0.69499
pH,1599.0,1.460557,0.03576,1.319086,1.437463,1.460938,1.481605,1.611436
sulphates,1599.0,0.501073,0.093731,0.285179,0.438255,0.482426,0.548121,1.098612


## Polynomial Features

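Generates polynomial and interaction features: all polynomial combinations of the input features with degree less than or equal to the specified degree.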

In [25]:
from sklearn.preprocessing import PolynomialFeatures

# Keep an independent copy of the raw frame; a later cell builds on it.
wine_data_copy = wine_data.copy()

print("Number of features before transformation: ", wine_data.shape)

# degree=2 adds a constant column plus every squared term and pairwise
# product of the original columns.
poly = PolynomialFeatures(degree=2)
poly_wine_data = poly.fit_transform(wine_data)

# Label fixed: this shape is measured *after* the polynomial expansion
# (the original message repeated "before").
print("Number of features after transformation: ", poly_wine_data.shape)

Number of features before transformation:  (1599, 12)
Number of features before transformation:  (1599, 91)


In [26]:
# Names of the features produced by the fitted transformer ('1' is the constant column).
poly.get_feature_names_out()

array(['1', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'fixed acidity^2', 'fixed acidity volatile acidity',
       'fixed acidity citric acid', 'fixed acidity residual sugar',
       'fixed acidity chlorides', 'fixed acidity free sulfur dioxide',
       'fixed acidity total sulfur dioxide', 'fixed acidity density',
       'fixed acidity pH', 'fixed acidity sulphates',
       'fixed acidity alcohol', 'fixed acidity quality',
       'volatile acidity^2', 'volatile acidity citric acid',
       'volatile acidity residual sugar', 'volatile acidity chlorides',
       'volatile acidity free sulfur dioxide',
       'volatile acidity total sulfur dioxide',
       'volatile acidity density', 'volatile acidity pH',
       'volatile acidity sulphates', 'volatile acidity alcohol',
       'volatile acidity quality', 'citric acid^2',
       'c

## Discretization

Discretization (also known as quantization or binning) provides a way to partition continuous features into discrete values.

In [28]:
from sklearn.preprocessing import KBinsDiscretizer

wine_data_copy_2 = wine_data_copy.copy()

# Select the single column with double brackets so the result is already a
# 2-D (n_samples, 1) array, the layout the discretizer is fitted on.
X = wine_data[['chlorides']].to_numpy()

# Split the feature into 10 bins and one-hot encode the bin membership.
enc = KBinsDiscretizer(n_bins=10, encode="onehot")
X_binned = enc.fit_transform(X)

In [29]:
# Display the encoded result; with encode="onehot" it is shown as a sparse matrix.
X_binned

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1599 stored elements and shape (1599, 10)>

In [30]:
# Preview the first 10 rows: slice the sparse matrix first, then densify only that slice.
X_binned[:10].toarray()

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

## Handling Categorical Data