# 🚀 Complete Feature Engineering Workbook
Generated on 2025-04-18 04:36

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Seed NumPy's global random generator so the np.random draws used to build
# the synthetic dataset are repeatable when the notebook is run top to bottom.
np.random.seed(42)

## 1️⃣ Creating a Synthetic Dataset

In [None]:
# Build a 100-row toy dataset mixing numeric, categorical, binary and text
# columns. The draws happen in column order, so a seeded run is unchanged.
n_rows = 100
synthetic_columns = {
    'Age': np.random.randint(low=18, high=70, size=n_rows),
    'Income': np.random.normal(loc=50000, scale=15000, size=n_rows).astype(int),
    'Gender': np.random.choice(['Male', 'Female'], size=n_rows),
    'Department': np.random.choice(['ICU', 'Surgery', 'Ortho'], size=n_rows),
    'Purchased': np.random.choice([0, 1], size=n_rows),
    'Text': np.random.choice(
        ['great product', 'bad service', 'excellent', 'not good'],
        size=n_rows,
    ),
}
df = pd.DataFrame(synthetic_columns)
df.head()

## 2️⃣ Handling Missing Values

In [None]:
# Demonstrate imputation: blank out every 10th Age value, then fill the gaps.
# Age is created as an integer column and NaN cannot be stored in one, so cast
# to float explicitly first; relying on pandas to upcast during the assignment
# is deprecated and emits a FutureWarning on recent versions.
df['Age'] = df['Age'].astype(float)
df.loc[::10, 'Age'] = np.nan

# Median imputation: the median is taken over the remaining (non-NaN) ages.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Mean imputation example. No NaNs were introduced into Income above, so on
# this dataset the call leaves the column unchanged.
df['Income'] = df['Income'].fillna(df['Income'].mean())
df.head(10)

## 3️⃣ Label and One-Hot Encoding

In [None]:
# Gender: map the string labels to integer codes, overwriting the column.
gender_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])

# Department: one-hot encode, dropping the first level to avoid a redundant
# indicator, and append the indicator columns next to the original column.
department_dummies = pd.get_dummies(df['Department'], drop_first=True)
df = pd.concat([df, department_dummies], axis=1)
df.head()

## 4️⃣ Feature Scaling

In [None]:
# Standardise Income (zero mean, unit variance) into a new column; the fitted
# scaler is kept in `scaler`.
income_frame = df[['Income']]
scaler = StandardScaler()
scaler.fit(income_frame)
df['Income_scaled'] = scaler.transform(income_frame)
df[['Income', 'Income_scaled']].head()

## 5️⃣ Text Vectorization - Bag of Words

In [None]:
# Bag of words: one column per vocabulary token, one row per document, with
# raw token counts as values.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['Text'])

# Densify the sparse count matrix for display, labelled by vocabulary token.
bow_tokens = vectorizer.get_feature_names_out()
bow_frame = pd.DataFrame(X_bow.toarray(), columns=bow_tokens)
bow_frame.head()

## 6️⃣ Text Vectorization - TF-IDF

In [None]:
# TF-IDF: same document/token layout as bag of words, but with TF-IDF weights
# instead of raw counts.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Text'])

# Densify the sparse weight matrix for display, labelled by vocabulary token.
tfidf_tokens = tfidf.get_feature_names_out()
tfidf_frame = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_tokens)
tfidf_frame.head()

## 7️⃣ Polynomial Feature Generation

In [None]:
# Hand-rolled degree-2 polynomial feature: the square of Age.
df['Age_squared'] = df['Age'].pow(2)
df.loc[:, ['Age', 'Age_squared']].head()

## 8️⃣ Feature Binning

In [None]:
# Split Income into equal-width bins, one per label, ordered from lowest to
# highest range.
income_labels = ['Low', 'Medium', 'High']
df['Income_bin'] = pd.cut(
    df['Income'],
    bins=len(income_labels),
    labels=income_labels,
)
df.loc[:, ['Income', 'Income_bin']].head()

## 9️⃣ PCA for Dimensionality Reduction

In [None]:
# Project Age and Income_scaled onto their two principal components and plot
# the result coloured by the Purchased label.
X = df[['Age', 'Income_scaled']].dropna()

# PCA follows variance, so Age in raw years would dominate the unit-variance
# Income_scaled column. Standardise both inputs so they contribute equally.
X_std = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
pca_data = pca.fit_transform(X_std)

# Assign by index rather than by position: any row removed by dropna() then
# receives NaN instead of triggering a length-mismatch error.
pca_frame = pd.DataFrame(pca_data, index=X.index, columns=['PC1', 'PC2'])
df['PC1'] = pca_frame['PC1']
df['PC2'] = pca_frame['PC2']

plt.figure(figsize=(6, 4))
plt.scatter(df['PC1'], df['PC2'], c=df['Purchased'], cmap='coolwarm')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA Components")
plt.grid(True)
plt.show()

## 🔟 Feature Interaction Terms

In [None]:
# Interaction feature: element-wise product of Age and the scaled Income.
df['Age_Income_Interaction'] = df['Age'].mul(df['Income_scaled'])
df.loc[:, ['Age', 'Income_scaled', 'Age_Income_Interaction']].head()

## 1️⃣1️⃣ Variance Threshold Feature Selection

In [None]:
# Drop candidate features whose variance does not exceed the threshold and
# report the names of those that remain.
vt_columns = ['Age', 'Income_scaled', 'Gender']
vt = VarianceThreshold(threshold=0.01)
selected = vt.fit_transform(df[vt_columns])
remaining_columns = vt.get_feature_names_out(vt_columns)
print("Remaining columns:", remaining_columns)

## 1️⃣2️⃣ Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap.
corr = df.corr(numeric_only=True)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 5))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation Matrix")
plt.show()

## 1️⃣3️⃣ Mutual Information

In [None]:
# Estimate the mutual information between each feature and the Purchased
# label. X and y are reused by the feature-scoring cells that follow.
X = df[['Age', 'Income_scaled', 'Gender']]
y = df['Purchased']

# Gender holds label-encoded categories, so flag it as discrete; by default a
# dense input has every column treated as continuous.
discrete_mask = X.columns.isin(['Gender'])

# The estimator is randomised; a fixed random_state makes the scores
# repeatable regardless of which cells ran before this one.
mi = mutual_info_classif(X, y, discrete_features=discrete_mask, random_state=42)
for feature, score in zip(X.columns, mi):
    print(f"{feature}: {score:.4f}")

## 1️⃣4️⃣ ANOVA F-Score

In [None]:
# ANOVA F-statistic of each feature against the label; f_classif also returns
# p-values, which are not reported here.
f_scores, _p_values = f_classif(X, y)
for feature, score in pd.Series(f_scores, index=X.columns).items():
    print(f"{feature}: {score:.4f}")

## 1️⃣5️⃣ Random Forest Feature Importance

In [None]:
# Fit a seeded random forest on X and y, then print one importance score per
# feature column.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
importances = model.feature_importances_
for position, feature in enumerate(X.columns):
    score = importances[position]
    print(f"{feature}: {score:.4f}")