# 🧠 Advanced Feature Engineering Notebook
Generated on 2025-04-18 04:32

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG so the synthetic data drawn below is reproducible.
np.random.seed(seed=42)

## 1️⃣ Sample Dataset

In [None]:
# Build a 100-row synthetic dataset. The random draws consume the global NumPy
# RNG in the order the columns are listed, so keep this order stable if the
# generated values need to stay reproducible.
df = pd.DataFrame({
    # Age is stored as float from the start: NaN cannot be held by an integer
    # column, and assigning NaN into one relies on silent upcasting that
    # pandas has deprecated.
    'Age': np.random.randint(18, 70, 100).astype(float),
    'Income': np.random.normal(50000, 15000, 100).astype(int),
    'Gender': np.random.choice(['Male', 'Female'], 100),
    'Department': np.random.choice(['ICU', 'Surgery', 'Ortho'], 100),
    'Purchased': np.random.choice([0, 1], 100),
    'Text': np.random.choice(['great product', 'bad service', 'excellent', 'not good'], 100),
})

# Simulate missing data in every 10th row, then impute with the median of the
# remaining observed ages.
df.loc[::10, 'Age'] = np.nan
df['Age'] = df['Age'].fillna(df['Age'].median())

# Replace the Gender strings with integer codes.
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])

# One-hot encode Department and append the indicator columns; drop_first=True
# omits the first category's column.
df = pd.concat([df, pd.get_dummies(df['Department'], drop_first=True)], axis=1)

# fit_transform returns an (n, 1) array; flatten it so a plain 1-D column is
# assigned explicitly.
df['Income_scaled'] = StandardScaler().fit_transform(df[['Income']]).ravel()

# Last expression of the cell: the notebook displays the first rows.
df.head()

## 2️⃣ Remove Low-Variance Features

In [None]:
# Columns screened for low variance; listed once and reused for both the fit
# and the feature-name lookup.
variance_cols = ['Age', 'Income_scaled', 'Gender']

# Drop any of these columns whose variance does not exceed 0.01.
selector = VarianceThreshold(threshold=0.01)
reduced = selector.fit_transform(df[variance_cols])

kept_names = selector.get_feature_names_out(variance_cols)
print("Remaining columns:", kept_names)

## 3️⃣ Correlation Matrix

In [None]:
import seaborn as sns

# Pairwise correlations over the numeric columns only.
corr = df.corr(numeric_only=True)

# Draw the annotated heatmap on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

## 4️⃣ Feature Selection using Mutual Information

In [None]:
# Feature matrix and target shared by the feature-selection cells.
X = df[['Age', 'Income_scaled', 'Gender']]
y = df['Purchased']

# discrete_features='auto' treats every column of a dense X as continuous.
# Gender is a label-encoded category, so mark it as discrete explicitly;
# Age and Income_scaled stay continuous.
discrete_mask = X.columns == 'Gender'

# The estimator adds small random noise to continuous features, so pin
# random_state to keep the scores reproducible across runs.
mi = mutual_info_classif(X, y, discrete_features=discrete_mask, random_state=42)
for col, val in zip(X.columns, mi):
    print(f"{col}: MI score = {val:.4f}")

## 5️⃣ Feature Selection using ANOVA F-Score

In [None]:
# f_classif returns a pair of arrays; only the first (the F-statistics) is
# reported here.
f_scores, _p_values = f_classif(X, y)
for feature_name, f_value in zip(X.columns, f_scores):
    print(f"{feature_name}: F-score = {f_value:.4f}")

## 6️⃣ Feature Importance from Random Forest

In [None]:
# Fit a 100-tree random forest with a fixed seed and report the importance it
# assigns to each feature column.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
importances = model.feature_importances_
for feature_name, weight in zip(X.columns, importances):
    print(f"{feature_name}: Importance = {weight:.4f}")

## 7️⃣ Polynomial Feature Generation

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the two numeric columns into all degree-2 terms, without the
# constant bias column.
poly_inputs = df[['Age', 'Income_scaled']]
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(poly_inputs)
print("Polynomial features shape:", X_poly.shape)

## 8️⃣ Optional: Automated Feature Engineering with FeatureTools

In [None]:
# Optional cell: everything below is left commented out, presumably so the
# notebook runs without featuretools installed. Uncomment all lines to try it.
# !pip install featuretools  # Uncomment if not installed
# import featuretools as ft
# es = ft.EntitySet(id='dataset')
# es = es.add_dataframe(dataframe_name='df', dataframe=df.reset_index(), index='index')
# feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='df')
# feature_matrix.head()