### Feature Transformation
#### What it is
- Changing a feature's distribution to make it more model-friendly (for example, reducing skew or compressing a very wide value range).

#### When it is needed

- Skewed data
- Long-tail distributions
- Large value ranges

In [2]:
import pandas as pd

In [3]:
import numpy as np

# Toy sample with a long right tail: two modest incomes and two very large ones.
income_values = [20000, 30000, 500000, 1000000]
df = pd.DataFrame({"income": income_values})

# Log transformation: the natural log compresses the large incomes far more
# than the small ones, pulling the long tail in.
log_of_income = np.log(df["income"])
df["log_income"] = log_of_income
df

Unnamed: 0,income,log_income
0,20000,9.903488
1,30000,10.308953
2,500000,13.122363
3,1000000,13.815511


In [None]:
# Square-root transformation: still shrinks the largest incomes, but far less
# aggressively than the log does.
income_sqrt = np.sqrt(df["income"])
df["sqrt_income"] = income_sqrt
df

Unnamed: 0,income,log_income,sqrt_income
0,20000,9.903488,141.421356
1,30000,10.308953,173.205081
2,500000,13.122363,707.106781
3,1000000,13.815511,1000.0


In [7]:
# Box-Cox power transformation (the input must be strictly positive).
from scipy.stats import boxcox

# With no lambda supplied, boxcox returns the transformed values together with
# the lambda it fitted. Keep that lambda under a descriptive name rather than
# assigning it to `_`: in IPython/Jupyter `_` is the "last output" variable,
# and overwriting it stops the kernel from updating it. The lambda is also
# what is needed later to invert the transform or apply it to new data.
df["boxcox_income"], boxcox_lambda = boxcox(df["income"])
df

Unnamed: 0,income,log_income,sqrt_income,boxcox_income
0,20000,9.903488,141.421356,8.573251
1,30000,10.308953,173.205081,8.873083
2,500000,13.122363,707.106781,10.856473
3,1000000,13.815511,1000.0,11.32007


In [9]:
# Yeo-Johnson power transformation: unlike Box-Cox it also accepts zero and
# negative inputs.
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method="yeo-johnson")

# The double brackets select a one-column DataFrame (2-D input) for the
# transformer. PowerTransformer standardises its output by default
# (standardize=True), which is why the resulting values are centred around 0.
yj_values = pt.fit_transform(df[["income"]])
df["yj_income"] = yj_values
df


Unnamed: 0,income,log_income,sqrt_income,boxcox_income,yj_income
0,20000,9.903488,141.421356,8.573251,-1.111726
1,30000,10.308953,173.205081,8.873083,-0.861571
2,500000,13.122363,707.106781,10.856473,0.793249
3,1000000,13.815511,1000.0,11.32007,1.180049
