# 🧼 Data Preprocessing Toolbox with Examples
This notebook contains key data preprocessing techniques every advanced data scientist should know — with practical Python examples.

## 📦 1. Missing Value Handling

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

# Toy frame with exactly one gap in each column
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [np.nan, 2, 3, 4]
})

# Mean imputation: learn the column means, then fill the gaps with them
imputer = SimpleImputer(strategy='mean')
imputer.fit(df)
df_imputed = pd.DataFrame(imputer.transform(df), columns=df.columns)

# KNN imputation: fill each gap from the 2 nearest rows
knn_imputer = KNNImputer(n_neighbors=2)
knn_imputer.fit(df)
df_knn = pd.DataFrame(knn_imputer.transform(df), columns=df.columns)

df_knn

## 📏 2. Feature Scaling & Transformation

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Two features on very different scales (x1 spans 1..1000, x2 stays below 1)
X = pd.DataFrame({'x1': [1, 10, 100, 1000], 'x2': [0.1, 0.5, 0.9, 0.95]})

# Standard scaling
standard = StandardScaler().fit(X).transform(X)

# MinMax scaling
minmax = MinMaxScaler().fit(X).transform(X)

# Robust scaling
robust = RobustScaler().fit(X).transform(X)

# Show the robust-scaled values under the original column labels
pd.DataFrame(robust, columns=X.columns)

## 🏷️ 3. Encoding Categorical Variables

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

df_cat = pd.DataFrame({'Color': ['Red', 'Blue', 'Green', 'Red']})

# One-hot encoding
# `sparse_output=False` asks for a dense array so the result can be wrapped
# in a DataFrame directly. The older keyword `sparse=False` was deprecated in
# scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
# This spelling requires scikit-learn >= 1.2.
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df_cat)

# One output column per category, labelled by the fitted encoder
pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

## 🔍 4. Outlier Detection & Removal

In [None]:
from scipy import stats

df_out = pd.DataFrame({'value': [10, 12, 13, 12, 100]})

# Robust (modified) z-score method
#
# Why not the classic z-score: the outlier itself inflates the mean and the
# standard deviation it is measured against. With n samples and the
# population std (scipy's default, ddof=0), |z| can never exceed sqrt(n - 1),
# which is 2 for these five rows. The value 100 scores about 1.999, so an
# `abs(z) < 2` filter keeps every row and removes nothing.
#
# The median and the MAD are barely affected by a single extreme value.
# scale='normal' rescales the MAD so it is comparable to a standard deviation
# for normally distributed data.
# NOTE: if more than half of a column's values are identical the MAD is 0 and
# the score is undefined; handle that case separately on real data.
center = df_out.median()
spread = stats.median_abs_deviation(df_out.to_numpy(), axis=0, scale='normal')
z_scores = (df_out - center) / spread

# 3.5 is a commonly used cut-off for the modified z-score
filtered = df_out[(abs(z_scores) < 3.5).all(axis=1)]
filtered

## 🔬 5. Multicollinearity Detection with VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import pandas as pd

df_vif = pd.DataFrame({
    'X1': [1, 2, 3, 4, 5],
    'X2': [2, 4, 6, 8, 10],  # exactly 2 * X1, i.e. perfectly collinear
    'X3': [5, 3, 6, 2, 1]
})

# Standardise the columns, then compute one VIF per column position
X_scaled = StandardScaler().fit_transform(df_vif)
vif_data = pd.DataFrame({
    'feature': df_vif.columns,
    'VIF': [
        variance_inflation_factor(X_scaled, col_idx)
        for col_idx in range(X_scaled.shape[1])
    ],
})
vif_data

## 🔁 6. Log Transformations

In [None]:
import numpy as np
import pandas as pd

# Build the income column and derive its natural log in a single chain
df_log = pd.DataFrame({'income': [500, 1000, 5000, 10000, 20000]}).assign(
    log_income=lambda frame: np.log(frame['income'])
)
df_log

## 🎯 7. Feature Selection

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_iris(return_X_y=True)

# Score every feature against the labels with f_classif and keep the best 2
selector = SelectKBest(score_func=f_classif, k=2)
selector.fit(X, y)
X_new = selector.transform(X)

# Preview the first five rows of the reduced matrix
X_new[:5]

## 🧬 8. Dimensionality Reduction (PCA)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Standardise the features before PCA
X_scaled = StandardScaler().fit_transform(X)

# Project onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter of the two components, coloured by label
fig, ax = plt.subplots()
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
ax.set(xlabel="PC1", ylabel="PC2", title="PCA Result")
plt.show()

## 🪵 9. Binning / Discretization

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

data = pd.DataFrame({'age': [22, 25, 47, 52, 46, 56]})

# 3 bins with the 'uniform' strategy; each age is replaced by its ordinal bin code
binner = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
binner.fit(data)
binned = binner.transform(data)

# `binned` has a single column; store it next to the original ages
data['age_bin'] = binned[:, 0]
data

## 🔄 10. Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer

# Load dataset
X, y = load_breast_cancer(return_X_y=True)

# Hold out 20% for evaluation. The seed is fixed so that the split, and
# therefore the accuracy reported below, is identical on every
# Restart & Run All; without it each run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline with scaler + model
# Fitting the pipeline fits the scaler on the training rows only, so no
# statistics from the test rows leak into preprocessing.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])

pipe.fit(X_train, y_train)

# Mean accuracy on the held-out rows
pipe.score(X_test, y_test)