# Feature Engineering Tutorial: From Raw to Refined
In this notebook, we will explore common feature engineering techniques on a synthetic dataset.

## Step 1: Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

sns.set(style='whitegrid')

## Step 2: Create a Sample Dataset

In [2]:
# Sample dataset
np.random.seed(42)
df = pd.DataFrame({
    'Age': np.random.randint(18, 65, size=100),
    'Gender': np.random.choice(['Male', 'Female'], size=100),
    'Salary': np.random.randint(30000, 100000, size=100),
    'Department': np.random.choice(['HR', 'Tech', 'Sales'], size=100),
    'Joined': pd.date_range(start='2015-01-01', periods=100, freq='M'),
    'Purchased': np.random.choice([0, 1], size=100)
})
df['Years_at_Company'] = (pd.Timestamp('2023-01-01') - df['Joined']).dt.days // 365
df.drop(columns='Joined', inplace=True)
df.head()

  'Joined': pd.date_range(start='2015-01-01', periods=100, freq='M'),


Unnamed: 0,Age,Gender,Salary,Department,Purchased,Years_at_Company
0,56,Female,51976,HR,1,7
1,46,Female,74262,HR,0,7
2,32,Male,53776,HR,0,7
3,60,Male,60080,Tech,1,7
4,25,Male,96842,HR,0,7


## Step 3: Encode Categorical Features

In [3]:
df_encoded = df.copy()
df_encoded['Gender'] = LabelEncoder().fit_transform(df_encoded['Gender'])
df_encoded = pd.get_dummies(df_encoded, columns=['Department'], drop_first=True)
df_encoded.head()

Unnamed: 0,Age,Gender,Salary,Purchased,Years_at_Company,Department_Sales,Department_Tech
0,56,0,51976,1,7,False,False
1,46,0,74262,0,7,False,False
2,32,1,53776,0,7,False,False
3,60,1,60080,1,7,False,True
4,25,1,96842,0,7,False,False


## Step 4: Binning and Transformations

In [4]:
# Binning Age into categories
df_encoded['Age_Bin'] = pd.cut(df_encoded['Age'], bins=[17, 25, 35, 50, 65], labels=['18-25', '26-35', '36-50', '51-65'])

# Log transform Salary
df_encoded['Log_Salary'] = np.log(df_encoded['Salary'])

df_encoded.head()

Unnamed: 0,Age,Gender,Salary,Purchased,Years_at_Company,Department_Sales,Department_Tech,Age_Bin,Log_Salary
0,56,0,51976,1,7,False,False,51-65,10.858537
1,46,0,74262,0,7,False,False,36-50,11.215355
2,32,1,53776,0,7,False,False,26-35,10.892583
3,60,1,60080,1,7,False,True,51-65,11.003432
4,25,1,96842,0,7,False,False,18-25,11.480836


## Step 5: Scaling Features

In [5]:
scaler = MinMaxScaler()
df_encoded[['Salary_Scaled', 'Years_Scaled']] = scaler.fit_transform(df_encoded[['Salary', 'Years_at_Company']])
df_encoded.head()

Unnamed: 0,Age,Gender,Salary,Purchased,Years_at_Company,Department_Sales,Department_Tech,Age_Bin,Log_Salary,Salary_Scaled,Years_Scaled
0,56,0,51976,1,7,False,False,51-65,10.858537,0.310682,1.0
1,46,0,74262,0,7,False,False,36-50,11.215355,0.638484,1.0
2,32,1,53776,0,7,False,False,26-35,10.892583,0.337158,1.0
3,60,1,60080,1,7,False,True,51-65,11.003432,0.429883,1.0
4,25,1,96842,0,7,False,False,18-25,11.480836,0.970612,1.0


## Step 6: Polynomial Features

In [6]:
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df_encoded[['Salary_Scaled', 'Years_Scaled']])
poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(['Salary_Scaled', 'Years_Scaled']))
df_final = pd.concat([df_encoded.reset_index(drop=True), poly_df], axis=1)
df_final.head()

Unnamed: 0,Age,Gender,Salary,Purchased,Years_at_Company,Department_Sales,Department_Tech,Age_Bin,Log_Salary,Salary_Scaled,Years_Scaled,Salary_Scaled.1,Years_Scaled.1,Salary_Scaled^2,Salary_Scaled Years_Scaled,Years_Scaled^2
0,56,0,51976,1,7,False,False,51-65,10.858537,0.310682,1.0,0.310682,1.0,0.096523,0.310682,1.0
1,46,0,74262,0,7,False,False,36-50,11.215355,0.638484,1.0,0.638484,1.0,0.407662,0.638484,1.0
2,32,1,53776,0,7,False,False,26-35,10.892583,0.337158,1.0,0.337158,1.0,0.113675,0.337158,1.0
3,60,1,60080,1,7,False,True,51-65,11.003432,0.429883,1.0,0.429883,1.0,0.184799,0.429883,1.0
4,25,1,96842,0,7,False,False,18-25,11.480836,0.970612,1.0,0.970612,1.0,0.942087,0.970612,1.0


## Step 7: Model Performance Before vs After Feature Engineering

In [7]:
# Model using raw features
X_raw = df[['Age', 'Salary', 'Years_at_Company']]
y = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.3, random_state=42)

raw_model = LogisticRegression()
raw_model.fit(X_train, y_train)
y_pred_raw = raw_model.predict(X_test)
print("Accuracy with Raw Features:", accuracy_score(y_test, y_pred_raw))

Accuracy with Raw Features: 0.4666666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Model using engineered features
X_eng = df_final.select_dtypes(include=[np.number]).drop(columns=['Purchased'])
X_train_eng, X_test_eng, y_train, y_test = train_test_split(X_eng, y, test_size=0.3, random_state=42)

eng_model = LogisticRegression(max_iter=1000)
eng_model.fit(X_train_eng, y_train)
y_pred_eng = eng_model.predict(X_test_eng)
print("Accuracy with Engineered Features:", accuracy_score(y_test, y_pred_eng))

Accuracy with Engineered Features: 0.4666666666666667
