In [1]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load Titanic dataset (using seaborn for easy access)
import seaborn as sns
titanic = sns.load_dataset('titanic')

# Select subset and drop rows with missing values for simplicity
titanic = titanic[['sex', 'embarked', 'pclass', 'fare', 'age', 'survived']].dropna()

# Features and target
X = titanic.drop('survived', axis=1)
y = titanic['survived']

# ---------------------------------------------
# Question 5: Label Encoding vs One-Hot Encoding on 'sex'
print("\nQuestion 5: Label Encoding vs One-Hot Encoding on 'sex'")

# Label Encoding (converts categories to integers)
le = LabelEncoder()
X_le = X.copy()
X_le['sex_encoded'] = le.fit_transform(X_le['sex'])
print("Label Encoded 'sex' values:")
print(X_le[['sex', 'sex_encoded']].head())

# One-Hot Encoding (creates separate binary columns)
ohe = OneHotEncoder(sparse=False, drop='first')  # drop first to avoid dummy variable trap
X_ohe = X.copy()
sex_ohe = ohe.fit_transform(X_ohe[['sex']])
sex_ohe_df = pd.DataFrame(sex_ohe, columns=ohe.get_feature_names_out(['sex']), index=X_ohe.index)
X_ohe = pd.concat([X_ohe, sex_ohe_df], axis=1)
print("\nOne-Hot Encoded 'sex' columns:")
print(X_ohe[['sex', 'sex_male']].head())  # 'sex_male' is one-hot encoded column

# ---------------------------------------------
# Question 6: Combining Min-Max Scaling and Standardization
print("\nQuestion 6: Combining Min-Max Scaling and Standardization")

# Example: First Min-Max scaling, then Standardization on 'fare' and 'age'

# Step 1: Min-Max scaling
minmax = MinMaxScaler()
X_scaled_minmax = X[['fare', 'age']].copy()
X_scaled_minmax = pd.DataFrame(minmax.fit_transform(X_scaled_minmax), columns=['fare', 'age'])

# Step 2: Standardization of Min-Max scaled data
std = StandardScaler()
X_scaled_combined = pd.DataFrame(std.fit_transform(X_scaled_minmax), columns=['fare', 'age'])

print("Original 'fare' and 'age' stats:")
print(X[['fare', 'age']].describe().loc[['min', 'max', 'mean', 'std']])
print("\nAfter Min-Max Scaling:")
print(X_scaled_minmax.describe().loc[['min', 'max', 'mean', 'std']])
print("\nAfter Standardization of Min-Max scaled data:")
print(X_scaled_combined.describe().loc[['min', 'max', 'mean', 'std']])

# ---------------------------------------------
# Question 7: Handling multiple categorical features with One-Hot Encoding ('sex', 'embarked')
print("\nQuestion 7: One-Hot Encoding 'sex' and 'embarked'")

# Use pandas get_dummies for simplicity, drop first to avoid dummy variable trap
X_ohe_multi = pd.get_dummies(X, columns=['sex', 'embarked'], drop_first=True)
print(X_ohe_multi.head())

# ---------------------------------------------
# Question 8: Ordinal Encoding for 'pclass' (Passenger class)
print("\nQuestion 8: Ordinal Encoding of 'pclass'")

# Since 'pclass' is already ordinal with ranks 1 (first) < 2 (second) < 3 (third),
# map it explicitly or use OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories=[[1, 2, 3]])
X_ordinal = X.copy()
X_ordinal['pclass_encoded'] = ordinal_encoder.fit_transform(X_ordinal[['pclass']])
print(X_ordinal[['pclass', 'pclass_encoded']].head())

# ---------------------------------------------
# Question 9: Impact of Scaling on Decision Tree vs SVM

print("\nQuestion 9: Impact of Scaling on Decision Tree vs SVM")

# Split data for modeling
X_model = X_ohe_multi.copy()  # Use one-hot encoded features for simplicity
X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size=0.3, random_state=42, stratify=y)

# Decision Tree (tree-based models are NOT sensitive to feature scaling)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print(f"Decision Tree Accuracy (no scaling needed): {accuracy_score(y_test, dt_pred):.4f}")

# SVM WITHOUT scaling
svm_no_scale = SVC(random_state=42)
svm_no_scale.fit(X_train, y_train)
svm_pred_no_scale = svm_no_scale.predict(X_test)
print(f"SVM Accuracy without scaling: {accuracy_score(y_test, svm_pred_no_scale):.4f}")

# SVM WITH scaling (StandardScaler)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_scaled = SVC(random_state=42)
svm_scaled.fit(X_train_scaled, y_train)
svm_pred_scaled = svm_scaled.predict(X_test_scaled)
print(f"SVM Accuracy with Standard Scaling: {accuracy_score(y_test, svm_pred_scaled):.4f}")

# ---------------------------------------------
# Question 10: Custom Transformation Function for High Cardinality Categorical Features

print("\nQuestion 10: Custom Encoding for High Cardinality Features")

# Example: Frequency encoding for a categorical feature (simulate with 'embarked')

def frequency_encoding(series):
    freq = series.value_counts(normalize=True)
    return series.map(freq)

# Apply frequency encoding to 'embarked'
X_freq_encoded = X.copy()
X_freq_encoded['embarked_freq'] = frequency_encoding(X_freq_encoded['embarked'])
print(X_freq_encoded[['embarked', 'embarked_freq']].head())

# Frequency encoding helps preserve information with fewer dimensions compared to one-hot encoding



Question 5: Label Encoding vs One-Hot Encoding on 'sex'
Label Encoded 'sex' values:
      sex  sex_encoded
0    male            1
1  female            0
2  female            0
3  female            0
4    male            1


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'