In [3]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.






In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Titanic dataset from seaborn
titanic = sns.load_dataset('titanic')

# Keep only relevant columns and drop rows with missing 'embarked' for simplicity
data = titanic[['sex', 'embarked', 'pclass', 'age', 'fare', 'survived']].dropna()

# Separate features and target
X = data.drop('survived', axis=1)
y = data['survived']

print("=== Dataset Sample ===")
print(X.head())

# === Question 5: Label Encoding vs One-Hot Encoding on 'sex' feature ===
print("\n--- Question 5 ---")

# Label Encoding 'sex'
le = LabelEncoder()
X['sex_label_encoded'] = le.fit_transform(X['sex'])
print("Label Encoded 'sex':")
print(X[['sex', 'sex_label_encoded']].head())

# One-Hot Encoding 'sex'
ohe = OneHotEncoder(drop='first', sparse=False)
sex_ohe = ohe.fit_transform(X[['sex']])
sex_ohe_df = pd.DataFrame(sex_ohe, columns=ohe.get_feature_names_out(['sex']))
print("One-Hot Encoded 'sex':")
print(sex_ohe_df.head())

# === Question 6: Combining Min-Max Scaling and Standardization ===
print("\n--- Question 6 ---")

num_features = ['age', 'fare']
scaler_minmax = MinMaxScaler()
scaler_standard = StandardScaler()

age_fare_minmax = scaler_minmax.fit_transform(X[num_features])
age_fare_standard = scaler_standard.fit_transform(X[num_features])

print("Min-Max scaled 'age' and 'fare' (first 5 rows):")
print(age_fare_minmax[:5])
print("Standardized 'age' and 'fare' (first 5 rows):")
print(age_fare_standard[:5])

# Explanation:
print("\nMin-Max scaling rescales features between 0 and 1, useful for bounded data.")
print("Standardization centers data to mean=0 and scales to unit variance, useful when data has outliers or varying scales.")

# === Question 7: One-Hot Encoding multiple categorical features ===
print("\n--- Question 7 ---")

cat_features = ['sex', 'embarked']
ohe_multi = OneHotEncoder(drop='first', sparse=False)
cat_encoded = ohe_multi.fit_transform(X[cat_features])
cat_encoded_df = pd.DataFrame(cat_encoded, columns=ohe_multi.get_feature_names_out(cat_features))
print(cat_encoded_df.head())

# === Question 8: Ordinal Encoding for 'pclass' ===
print("\n--- Question 8 ---")

# Passenger class is ranked: 1 (first) > 2 (second) > 3 (third)
ordinal_enc = OrdinalEncoder(categories=[[1, 2, 3]])
X['pclass_ordinal'] = ordinal_enc.fit_transform(X[['pclass']])
print(X[['pclass', 'pclass_ordinal']].head())

# === Question 9: Impact of Scaling on Decision Tree vs SVM ===
print("\n--- Question 9 ---")

# Prepare dataset for modeling
X_model = X.copy()

# For simplicity, encode categorical variables using one-hot (sex, embarked) and ordinal (pclass)
X_model = X_model.drop(['sex'], axis=1)  # drop original sex column
X_model = pd.concat([X_model, cat_encoded_df], axis=1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_model, y, random_state=42, test_size=0.3)

# Decision Tree - no scaling required
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print(f"Decision Tree accuracy: {accuracy_score(y_test, dt_pred):.4f}")

# SVM without scaling
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(f"SVM accuracy without scaling: {accuracy_score(y_test, svm_pred):.4f}")

# SVM with scaling (standardization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm_scaled = SVC(random_state=42)
svm_scaled.fit(X_train_scaled, y_train)
svm_scaled_pred = svm_scaled.predict(X_test_scaled)
print(f"SVM accuracy with Standard Scaling: {accuracy_score(y_test, svm_scaled_pred):.4f}")

# === Question 10: Custom Transformer for High Cardinality Features ===
print("\n--- Question 10 ---")

from sklearn.base import BaseEstimator, TransformerMixin

class TopCategoryEncoder(BaseEstimator, TransformerMixin):
    """
    Encode categorical feature by keeping top N categories, 
    grouping others as 'Other' to reduce cardinality.
    """
    def __init__(self, top_n=3):
        self.top_n = top_n
        self.top_categories_ = None

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]
        self.top_categories_ = X.value_counts().nlargest(self.top_n).index.tolist()
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]
        X_new = X.apply(lambda x: x if x in self.top_categories_ else 'Other')
        return pd.DataFrame(X_new)

# Example usage with 'embarked' column
top_encoder = TopCategoryEncoder(top_n=2)
top_encoder.fit(X[['embarked']])
X_embarked_reduced = top_encoder.transform(X[['embarked']])
print("Original 'embarked' categories:")
print(X['embarked'].value_counts())
print("\nAfter TopCategoryEncoder (top 2 kept, others grouped as 'Other'):")
print(X_embarked_reduced['embarked'].value_counts())


=== Dataset Sample ===
      sex embarked  pclass   age     fare
0    male        S       3  22.0   7.2500
1  female        C       1  38.0  71.2833
2  female        S       3  26.0   7.9250
3  female        S       1  35.0  53.1000
4    male        S       3  35.0   8.0500

--- Question 5 ---
Label Encoded 'sex':
      sex  sex_label_encoded
0    male                  1
1  female                  0
2  female                  0
3  female                  0
4    male                  1


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'