In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install category_encoders

import category_encoders as ce

In [None]:
from sklearn.preprocessing import (MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, QuantileTransformer,
    LabelEncoder, OneHotEncoder, OrdinalEncoder)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# 2. Load Dataset
# Fetch seaborn's example Titanic passenger table.
titanic = sns.load_dataset('titanic')

# Keep five predictors plus the `survived` target; .copy() gives an
# independent frame, so later edits to `df` never touch `titanic`.
df = titanic.loc[:, ['class', 'sex', 'age', 'fare', 'embarked', 'survived']].copy()

# Nothing is injected here: per the original note, the Titanic data
# already contains missing values.
df.head()


In [None]:
# Reorder so the numeric columns come first, then discard every row that
# has a missing value in any of the kept columns.
df = df.loc[:, ["age", "fare", "sex", "class", "embarked", "survived"]].dropna(how="any")


In [None]:
# 3. Basic Data Info
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
df.info()

In [None]:
# Summary statistics for every column (numeric and categorical alike),
# transposed so that each feature occupies one row.
print(df.describe(include="all").transpose())

info() shows datatypes (which helps us know which features are categorical/numeric).

describe() summarizes statistics.

In [None]:

# Drop rows with missing values by rebinding `df` to the cleaned result
# instead of mutating the frame in place (inplace=True); the cell stays
# safe to re-run and no other reference to the frame is silently changed.
df = df.dropna()
df.head()


# **Scaling (Normalization & Standardization)**

#  Feature Scaling: Normalization vs. Standardization

Feature scaling is an essential preprocessing step in machine learning.  
Many algorithms (Linear Regression, Logistic Regression, SVMs, PCA, K-means, Neural Networks) rely on **distances, dot-products, or gradient descent**. If features are on very different scales, the model may become biased toward larger-magnitude features.

---

## Normalization (Min–Max Scaling)

**Definition:**  
Rescales values of a feature into a fixed range, usually [0, 1].  

$$
x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}
$$

- Each feature’s smallest value becomes 0, largest becomes 1.  
- Preserves the shape of the distribution, but **compresses** the scale.  
- Very sensitive to **outliers** (a single extreme value can stretch the scale).  

**When to use:**  
- Works well when features are bounded (e.g., pixel intensities between 0–255).  
- Useful in **Neural Networks** where inputs are often normalized.  
- Less suitable for features with heavy outliers.

---

## Standardization (Z-score Scaling)

**Definition:**  
Centers the feature at mean 0 and scales it to unit variance.  

$$
x' = \frac{x - \mu}{\sigma}
$$

where $\mu$ is the feature mean and $\sigma$ is the standard deviation.  

- The transformed feature has mean ≈ 0 and standard deviation ≈ 1.  
- Not limited to [0, 1]; values can be negative or greater than 1.  
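- Less distorted by outliers than min–max normalization, although the mean and standard deviation are themselves pulled by extreme values (consider `RobustScaler` when outliers are heavy).  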
- Assumes features are roughly Gaussian (bell-shaped) for best results.  

**When to use:**  
- Default choice for **Linear/Logistic Regression, SVMs, PCA, and K-means**.  
- Works well when features have different units (e.g., age in years, fare in dollars).  
- Preserves outlier influence without compressing them to 0–1.

---

##  Choosing Between Normalization and Standardization

- If your model relies on **distance or dot-product geometry** (linear regression, SVM, PCA, neural networks) → **Standardization** is often better.  
- If you need features in a **bounded range** (e.g., image intensities, probability inputs) → **Normalization** works well.  
- With **tree-based models** (Decision Trees, Random Forests, XGBoost, CatBoost), scaling usually **doesn’t matter** because splits are not distance-based.  

---

##  Titanic Example (numeric features only)

In the Titanic dataset, consider two features:  
- **`age`** (in years, typically 0–80 but with some missing values).  
- **`fare`** (in dollars, highly skewed with very large outliers).  

- With **Normalization**, `fare` outliers (very expensive tickets) will squeeze the majority of passenger fares into a very narrow [0–0.1] range.  
- With **Standardization**, `fare` will be centered and scaled, but outliers will appear as large positive z-scores, which keeps more useful variation for linear models.  

---

**Summary:**  
- **Normalization:** good when bounded [0,1] values are needed, but sensitive to outliers.  
- **Standardization:** good for most ML algorithms, especially linear ones; robust to varying feature units.  


In [None]:
# Min-max normalization: rescales each numeric column into [0, 1], which is
# useful when features have different ranges.
scaler = MinMaxScaler()
minmax_values = scaler.fit_transform(df[["age", "fare"]])

# Work on a copy so the unscaled frame stays available for comparison.
df_minmax = df.copy()
df_minmax[["age", "fare"]] = minmax_values

print(df_minmax.head())

# Visualization: raw fare (left) next to the rescaled fare (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
df["fare"].plot.hist(bins=30, ax=axes[0], title="Original Fare")
df_minmax["fare"].plot.hist(bins=30, ax=axes[1], title="MinMax Normalized Fare", color="orange")
plt.show()


In [None]:
# Standardization: transforms each numeric column to mean 0 and standard
# deviation 1. Best for models assuming Gaussian-like features.
scaler = StandardScaler()
standard_values = scaler.fit_transform(df[["age", "fare"]])

# Work on a copy so the unscaled frame stays available for comparison.
df_standard = df.copy()
df_standard[["age", "fare"]] = standard_values

print(df_standard.head())

# Visualization: raw fare (left) next to the standardized fare (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
df["fare"].plot.hist(bins=30, ax=axes[0], title="Original Fare")
df_standard["fare"].plot.hist(bins=30, ax=axes[1], title="Standardized Fare", color="green")
plt.show()

##  QuantileTransformer

**What it does:**  
- Transforms features to follow a **target distribution** by mapping quantiles.  
- Two main options in scikit-learn:  
  - `output_distribution="uniform"` → transforms features into [0, 1] uniform distribution.  
  - `output_distribution="normal"` → transforms features into standard normal (mean=0, var=1).  

**How it works:**  
- Computes the empirical CDF (cumulative distribution function) of the data.  
- Maps each value to its quantile, then to the chosen output distribution.  

**Properties:**  
- Makes distributions **more Gaussian-like** (if `normal`).  
- Very robust to **outliers** (they get compressed into the tails).  
- Non-linear transformation → changes relationships between features.  

**When to use:**  
- Features are highly skewed, long-tailed, or non-Gaussian.  
- Useful before algorithms that are sensitive to non-normality (e.g., linear models, Gaussian-based methods).  
- Works well when feature scales are very irregular.  

---

##  StandardScaler vs. QuantileTransformer

| Aspect | StandardScaler | QuantileTransformer |
|--------|----------------|----------------------|
| Effect on Mean/Variance | Centers to mean 0, var 1 | Shapes data into uniform [0,1] or normal |
| Handles Outliers | No (keeps them extreme) | Yes (compresses them into tails) |
| Distribution Shape | Preserved (still skewed if original is skewed) | Changed (forces uniform or normal) |
| Use Case | Roughly Gaussian data, not too skewed | Highly skewed, heavy-tailed, or irregular data |

---

##  Titanic Example (numeric feature: `fare`)

- **StandardScaler:**  
  - `fare` will be centered at 0, variance = 1.  
  - Extremely high fares remain large positive z-scores.  
- **QuantileTransformer (normal):**  
  - `fare` distribution becomes closer to Gaussian.  
  - Very expensive tickets are pushed into the far right tail but less extreme.  

---

 **Summary:**  
- Use **StandardScaler** when features are moderately well-behaved (close to Gaussian).  
- Use **QuantileTransformer** when features are **skewed** or have **outliers**, and you want a Gaussian or uniform output distribution.  

In [None]:
# Quantile transform to a uniform [0, 1] distribution.
# n_quantiles defaults to 1000; capping it at the number of rows avoids the
# scikit-learn warning emitted when it exceeds n_samples (the library would
# clamp it to n_samples anyway, so the result is unchanged).
# random_state pins the transformer's internal random sampling so reruns
# are reproducible.
scaler = QuantileTransformer(
    n_quantiles=min(1000, len(df)),
    output_distribution="uniform",
    random_state=42,
)
df_quantile_uniform = df.copy()
df_quantile_uniform[["age", "fare"]] = scaler.fit_transform(df[["age", "fare"]])

print(df_quantile_uniform.head())

# Visualization: raw fare (left) next to the quantile-mapped fare (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
df["fare"].plot(kind="hist", bins=30, ax=axes[0], title="Original Fare")
df_quantile_uniform["fare"].plot(kind="hist", bins=30, ax=axes[1], title="Quantile Uniform Fare", color="brown")
plt.show()

In [None]:
# Quantile transform to a standard normal distribution.
# n_quantiles defaults to 1000; capping it at the number of rows avoids the
# scikit-learn warning emitted when it exceeds n_samples (the library would
# clamp it to n_samples anyway, so the result is unchanged).
# random_state pins the transformer's internal random sampling so reruns
# are reproducible.
scaler = QuantileTransformer(
    n_quantiles=min(1000, len(df)),
    output_distribution="normal",
    random_state=42,
)
df_quantile_normal = df.copy()
df_quantile_normal[["age", "fare"]] = scaler.fit_transform(df[["age", "fare"]])

print(df_quantile_normal.head())

# Visualization: raw fare (left) next to the quantile-mapped fare (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
df["fare"].plot(kind="hist", bins=30, ax=axes[0], title="Original Fare")
df_quantile_normal["fare"].plot(kind="hist", bins=30, ax=axes[1], title="Quantile Normal Fare", color="cyan")
plt.show()

# **Encoding**

#  Why Do We Need Encoding?

Machine learning models usually work with **numerical features**.  
But in many datasets (like Titanic), we have **categorical features** such as:

- `sex` → {male, female}  
- `embarked` → {C, Q, S}  
- `class` → {First, Second, Third}  

These values are **labels**, not numbers.  
If we feed them directly into most ML algorithms, they will not understand the categories.  

 **Encoding** is the process of converting categorical variables into numeric form so that models can use them.

---

##  Label Encoding

**How it works:**  
- Assigns each category a unique integer.  

Example (scikit-learn's `LabelEncoder` numbers the categories in sorted order):  
- `sex`: {female → 0, male → 1}  
- `embarked`: {C → 0, Q → 1, S → 2}  

**Advantages:**  
- Simple and compact.  
- Works well for **tree-based models** (Decision Trees, Random Forest, XGBoost), since they split based on thresholds, not distances.  

**Disadvantages:**  
- Imposes a **fake order** (0 < 1 < 2), which can mislead models like Linear Regression, Logistic Regression, or SVM (they may think `S` > `Q` > `C`).  

---

##  One-Hot Encoding (OHE)

**How it works:**  
- Creates a new **binary column for each category**.  
- Value is `1` if the row belongs to that category, else `0`.  

Example: `embarked` → {C, Q, S}  

| embarked | embarked_C | embarked_Q | embarked_S |
|----------|------------|------------|------------|
| C        | 1          | 0          | 0          |
| S        | 0          | 0          | 1          |
| Q        | 0          | 1          | 0          |

**Advantages:**  
- Avoids false ordinal relationships.  
- Safe default for **linear models, logistic regression, SVMs, and neural networks**.  

**Disadvantages:**  
- Increases the number of features (can be large if categories are many).  
- Can lead to **sparse matrices** with high-cardinality categorical variables.  

---

## When to Use Which?

- **Tree-based models (Decision Trees, Random Forest, Gradient Boosting):**  
  → Label Encoding is fine (trees handle categories by splitting, not by distances).  

- **Linear models, Logistic Regression, SVMs, Neural Networks:**  
  → One-Hot Encoding is preferred (no fake ordering).  

- **High cardinality (hundreds/thousands of categories):**  
  → Consider advanced techniques like **Target Encoding, Frequency Encoding, or Hashing**.  

---

**Summary:**  
- **Label Encoding** → compact, but risky for linear models.  
- **One-Hot Encoding** → more features, but safer for most models.  
- Always choose encoding based on **model type** and **data characteristics**.  


In [None]:
categorical_features = ["sex", "embarked", "class"]

# 1. Label Encoding
# A fresh LabelEncoder is fitted for each column and stored in
# `label_encoders`, so every column's category -> integer mapping remains
# available afterwards (e.g. via .classes_ or .inverse_transform).
# Refitting one shared encoder would leave it holding only the classes of
# the last column processed.
df_label = df.copy()
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df_label[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` still refers to the encoder of the last column,
# exactly as it did when a single encoder was reused.
print("Label Encoding:\n", df_label, "\n")

In [None]:
# 2. One-Hot Encoding
# One 0/1 indicator column per category level; drop_first=True omits the
# first level of each encoded feature.
df_onehot = pd.get_dummies(data=df, columns=categorical_features, drop_first=True)
onehot_preview = df_onehot.head()
print("One-Hot Encoding:\n", onehot_preview, "\n")

# 🎛️ Beyond One-Hot and Label Encoding: Advanced Encodings


##  Ordinal Encoding

**What it does:**  
- Maps categories to integers **preserving their natural order**.  

Example (Titanic `class`, using the zero-based codes that scikit-learn's `OrdinalEncoder` assigns):  
- First → 2  
- Second → 1  
- Third → 0  

**Why:**  
- `class` is truly ordered (First > Second > Third).  
- Unlike generic Label Encoding, the ordering here makes sense.  

**Use case:**  
- Works well for features with meaningful hierarchy (education level, size: small/medium/large, etc.).  
- Safe for linear models if the relationship is monotonic.

---



In [None]:
# 3. Ordinal Encoding (for ordered categories like 'class')
# The explicit list fixes the intended order Third < Second < First
# instead of leaving it to the encoder's default ordering.
class_order = ["Third", "Second", "First"]
ord_enc = OrdinalEncoder(categories=[class_order])

df_ord = df.copy()
df_ord["class"] = ord_enc.fit_transform(df[["class"]])

print("Ordinal Encoding:\n", df_ord[["class"]], "\n")

## ✅ Target Encoding

**What it does:**  
- Replaces each category with a **statistic of the target variable** (commonly the mean).  

Example (Titanic `sex` with target = `survived`):  
- Female → survival rate ≈ 0.74  
- Male → survival rate ≈ 0.19  

So `sex` becomes:  
- Female → 0.74  
- Male → 0.19  

**Why:**  
- Collapses categorical values into **one numeric feature**.  
- Very powerful for **high-cardinality features** (like hundreds of ZIP codes).  

---


In [None]:
# 4. Target Encoding (Mean of survived per category)
# Each `embarked` category is replaced with a statistic of the target
# (`survived`) learned during fit.
# NOTE(review): the encoder is fitted on every row together with its own
# target value. That is fine for a demo, but it leaks the target if the
# encoded column is later used for modelling.
encoder_target = ce.TargetEncoder(cols=["embarked"])
target_inputs = df[["embarked"]]
target_values = df["survived"]
df_target = encoder_target.fit_transform(target_inputs, target_values)

print("After Target Encoding (Embarked):")
print(df_target)


## ✅ Binary Encoding

**What it does:**  
- Converts category indices into **binary digits** and places them across new columns.  
- A balance between Label and One-Hot: fewer columns than OHE, but no fake ordering.  

Example (imagine `embarked` has categories C=0, Q=1, S=2):  
- 0 → 00  
- 1 → 01  
- 2 → 10  

So we get two binary columns:  

| embarked | bin1 | bin2 |
|----------|------|------|
| C (0)    |  0   |  0   |
| Q (1)    |  0   |  1   |
| S (2)    |  1   |  0   |

**Use case:**  
- Good for **moderate or high-cardinality** features where OHE would explode in dimensionality.  
- Still keeps categories separated better than plain Label Encoding.

---


 **Key Takeaway:**  
- Choose encoding by **nature of the feature** and **model type**.  
- Use Ordinal for true orders, One-Hot for nominal categories in linear models, Target/Binary for high-cardinality, and Helmert (a contrast coding not demonstrated in this notebook) for statistical interpretability.


In [None]:
# 5. Binary Encoding
# Converts categories into binary digits spread over a few columns, giving
# fewer columns than one-hot encoding.
embarked_levels = df["embarked"].unique()

encoder = ce.BinaryEncoder(cols=["embarked"])
df_binary = encoder.fit_transform(df[["embarked"]])

print("Original values:\n", embarked_levels)
print("\nAfter Binary Encoding:\n", df_binary)


##  Summary Table

| Encoding Type   | Best For | Pros | Cons |
|-----------------|----------|------|------|
| **Ordinal**     | Ordered categories (class, size) | Simple, preserves hierarchy | Wrong if order is artificial |
| **Target**      | High-cardinality categorical | Compresses info, powerful | Risk of leakage, must use CV |
| **Binary**      | Medium/high-cardinality | Fewer features than OHE | Less interpretable |


---

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Column groups consumed by the preprocessing pipeline
num_features = ['pclass', 'age', 'sibsp', 'parch']
cat_features = ['sex', 'embarked']

# Titanic rows with a known fare; `fare` is the regression target
titanic = sns.load_dataset('titanic').dropna(subset=['fare'])
X = titanic.loc[:, ['pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]
y = titanic['fare']

# 80/20 hold-out split with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def build_pipe(scaler, encoder, num_cols=None, cat_cols=None):
    """Assemble a preprocessing + LinearRegression pipeline.

    Numeric columns are median-imputed and then passed through ``scaler``;
    categorical columns are imputed with their most frequent value and then
    passed through ``encoder``. Both branches are combined in a
    ColumnTransformer that feeds a LinearRegression.

    Parameters
    ----------
    scaler : transformer
        Applied to the numeric columns after imputation. The instance is
        embedded as-is (not copied), so pass a fresh object per pipeline.
    encoder : transformer
        Applied to the categorical columns after imputation. Also embedded
        as-is.
    num_cols : list of str, optional
        Numeric column names. Defaults to the module-level ``num_features``.
    cat_cols : list of str, optional
        Categorical column names. Defaults to the module-level
        ``cat_features``.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps ``'pre'`` (ColumnTransformer) and
        ``'lr'`` (LinearRegression).
    """
    # Resolve defaults at call time so existing two-argument calls keep
    # using whatever the notebook-level lists currently contain.
    if num_cols is None:
        num_cols = num_features
    if cat_cols is None:
        cat_cols = cat_features

    num_pipe = Pipeline([
        ('imp', SimpleImputer(strategy='median')),
        ('scale', scaler)
    ])
    cat_pipe = Pipeline([
        ('imp', SimpleImputer(strategy='most_frequent')),
        ('encode', encoder)
    ])
    pre = ColumnTransformer([
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ])
    return Pipeline([('pre', pre), ('lr', LinearRegression())])

# Pipelines to compare, keyed by the label shown in the results table
pipelines = {
    "Normalization + OHE": build_pipe(
        MinMaxScaler(),
        OneHotEncoder(drop='first', handle_unknown='ignore'),
    ),
    "Standardization + OHE": build_pipe(
        StandardScaler(),
        OneHotEncoder(drop='first', handle_unknown='ignore'),
    ),
    "Standardization + Label": build_pipe(
        StandardScaler(),
        OrdinalEncoder(),
    ),
}


def _score(label, model):
    """Fit `model` on the training split and return its test-set metrics as one row."""
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    return {
        "Model": label,
        "MAE": mean_absolute_error(y_test, y_hat),
        # RMSE is the square root of the mean squared error
        "RMSE": mean_squared_error(y_test, y_hat) ** 0.5,
        "R2": r2_score(y_test, y_hat),
    }


# Fit, predict, and compare (one result row per pipeline, in dict order)
results = [_score(label, model) for label, model in pipelines.items()]

# Best (lowest RMSE) first
df_results = pd.DataFrame(results).sort_values(by="RMSE")
print(df_results.to_string(index=False))
