In [None]:
# Feature Engineering
Practical demonstration of feature creation, selection, scaling, and extraction techniques.


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression


In [2]:
# Toy dataset: experience, an integer-coded education level, and salary
# for five rows. Column order follows the keyword order below.
data = dict(
    Experience=[1, 2, 3, 4, 5],
    Education_Level=[1, 2, 2, 3, 3],
    Salary=[30000, 40000, 50000, 65000, 80000],
)

df = pd.DataFrame.from_dict(data)
df


Unnamed: 0,Experience,Education_Level,Salary
0,1,1,30000
1,2,2,40000
2,3,2,50000
3,4,3,65000
4,5,3,80000


In [3]:
# Feature creation: expand the single Experience predictor into polynomial terms.
# X stays two-dimensional (a one-column frame); y is the Salary target.
X = df.loc[:, ["Experience"]]
y = df.loc[:, "Salary"]

# degree=2 without the bias column yields two columns: Experience and Experience**2.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit(X).transform(X)

X_poly


array([[ 1.,  1.],
       [ 2.,  4.],
       [ 3.,  9.],
       [ 4., 16.],
       [ 5., 25.]])

In [4]:
# Feature selection: score every candidate predictor (all columns except the
# Salary target) against y with a univariate F-test and keep the single best one.
# Selecting from `X` alone would be a no-op: it holds only the Experience column,
# so k=1 would simply hand that column back without comparing anything.
X_candidates = df.drop(columns="Salary")

selector = SelectKBest(score_func=f_regression, k=1)
X_selected = selector.fit_transform(X_candidates, y)

X_selected


array([[1],
       [2],
       [3],
       [4],
       [5]])

In [5]:
# Feature scaling: standardise every predictor column (everything except the
# Salary target) with StandardScaler's default settings.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop(columns=["Salary"]))


In [6]:
# Feature extraction: project the standardised predictors onto their principal
# components. X_scaled has two columns (Experience, Education_Level), so
# n_components=2 keeps every component -- this rotates the data onto new axes
# rather than reducing its dimensionality.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

X_pca


array([[-2.13389342,  0.13389342],
       [-0.68898224, -0.31101776],
       [-0.18898224,  0.18898224],
       [ 1.25592895, -0.25592895],
       [ 1.75592895,  0.24407105]])

In [7]:
# Fraction of the total variance explained by each principal component, in component order.
pca.explained_variance_ratio_


array([0.97245559, 0.02754441])

In [None]:
## Key Takeaways
- The quality of the features strongly influences model performance
- Always validate engineered features
- PCA reduces dimensionality at the cost of interpretability
- Feature engineering is iterative
