In [None]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same dataset and explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with an SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.






In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load Titanic dataset (using seaborn for convenience)
import seaborn as sns
# Pull the Titanic passenger table from seaborn's sample datasets and keep only
# the rows where every categorical column used in the exercises is present.
key_columns = ['sex', 'pclass', 'embarked']
df = (
    sns.load_dataset('titanic')
    .dropna(subset=key_columns)
)

print("Original data sample:")
print(df[key_columns].head())

# --- Question 5: Label Encoding vs One-Hot Encoding on 'sex' ---
print("\n--- Question 5: Label Encoding vs One-Hot Encoding ---")

# Label encoding: every category collapses to one integer code in a single
# column (the printed sample shows 'female' -> 0 and 'male' -> 1).
# NOTE: scikit-learn documents LabelEncoder as an encoder for target values;
# it is applied to a feature here only to contrast it with one-hot encoding.
label_enc = LabelEncoder()
df['sex_label'] = label_enc.fit_transform(df['sex'])

# One-hot encoding: one 0/1 indicator column per category. drop=None keeps an
# indicator for every category instead of dropping one of them.
onehot_enc = OneHotEncoder(sparse_output=False, drop=None)
sex_onehot = onehot_enc.fit_transform(df[['sex']])

# Name each indicator after the category the fitted encoder placed at that
# output position, rather than assuming column 0 is 'female' and 1 is 'male'.
sex_onehot_columns = []
for position, category in enumerate(onehot_enc.categories_[0]):
    column_name = f'sex_onehot_{category}'
    df[column_name] = sex_onehot[:, position]
    sex_onehot_columns.append(column_name)

print(df[['sex', 'sex_label'] + sex_onehot_columns].head())

# --- Question 6: Combining Min-Max Scaling and Standardization ---
print("\n--- Question 6: Combining Min-Max Scaling and Standardization ---")
num_features = ['age', 'fare']

# Replace missing numeric values with the column mean so the scaling
# demonstration works on complete columns.
column_means = df[num_features].mean()
df[num_features] = df[num_features].fillna(column_means)

# Stage 1 (min-max) maps each column onto [0, 1]; stage 2 (standardization)
# re-centres that output to zero mean and unit variance.
# Explanation of the result: min-max scaling only shifts and rescales a column
# by constants, and standardization cancels any such shift/rescale, so the
# chained output should match standardizing the raw columns directly. That is
# why the printed values are not confined to [0, 1].
scaler_minmax = MinMaxScaler()
scaler_standard = StandardScaler()
df_minmax = scaler_minmax.fit_transform(df[num_features])
df_standard = scaler_standard.fit_transform(df_minmax)

print("First 5 rows after Min-Max scaling and Standardization:")
print(df_standard[:5])

# --- Question 7: Handling multiple categorical features with One-Hot Encoding ---
print("\n--- Question 7: One-Hot Encoding 'sex' and 'embarked' ---")
cat_features = ['sex', 'embarked']

# A single encoder handles both columns; drop='first' omits the first category
# of each feature so the remaining indicators are not redundant.
onehot_enc_multi = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = onehot_enc_multi.fit_transform(df[cat_features])
encoded_columns = onehot_enc_multi.get_feature_names_out(cat_features)
encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns)

# encoded_df carries a fresh 0..n-1 index, so df's index is reset to the same
# range before the two frames are placed side by side.
df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)
print(df[[*cat_features, *encoded_df.columns]].head())

# --- Question 8: Ordinal Encoding for 'pclass' (ranked categories) ---
print("\n--- Question 8: Ordinal Encoding for 'pclass' ---")
# The rank order is spelled out explicitly, from the highest class (1) to the
# lowest class (3); the encoder assigns codes following this listed order.
pclass_rank_order = [1, 2, 3]
ordinal_encoder = OrdinalEncoder(categories=[pclass_rank_order])
pclass_codes = ordinal_encoder.fit_transform(df[['pclass']])
df['pclass_encoded'] = pclass_codes
print(df[['pclass', 'pclass_encoded']].head())

# --- Question 9: Impact of Scaling on Decision Tree vs SVM ---
print("\n--- Question 9: Impact of Scaling on Decision Tree vs SVM ---")
# Prepare features and target.
# NOTE: 'age' was mean-imputed on the full frame before this split, so the
# test rows contributed to that mean; acceptable for a demo, but a real
# evaluation should impute inside the training split only.
features = ['age', 'fare', 'pclass_encoded'] + list(encoded_df.columns)
X = df[features].fillna(0)
y = df['survived']

# Train-test split (fixed seed so every configuration below sees the same rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Headline pair: Decision Tree on the raw features
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)

# Headline pair: SVM on standardized features (scaler fitted on train only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm = SVC(random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
acc_svm = accuracy_score(y_test, y_pred_svm)

print(f"Decision Tree accuracy (no scaling): {acc_dt:.3f}")
print(f"SVM accuracy (with standard scaling): {acc_svm:.3f}")

# Full comparison: every scaler (plus an unscaled baseline) with both models,
# so the effect of scaling on each algorithm can actually be read off.
# Expectation to check against the table: tree splits are per-feature
# thresholds, so rescaling a feature should leave the tree's accuracy
# essentially unchanged, while the SVM's accuracy should depend on the scaler.
scaling_options = {
    'none': None,
    'min-max': MinMaxScaler(),
    'standard': StandardScaler(),
    'robust': RobustScaler(),
}
model_classes = {'Decision Tree': DecisionTreeClassifier, 'SVM': SVC}

scaling_accuracy = {model_name: {} for model_name in model_classes}
for scaling_name, scaling in scaling_options.items():
    if scaling is None:
        train_features, test_features = X_train, X_test
    else:
        # Fit on the training split only so no test-set statistics leak in.
        train_features = scaling.fit_transform(X_train)
        test_features = scaling.transform(X_test)
    for model_name, model_class in model_classes.items():
        model = model_class(random_state=42).fit(train_features, y_train)
        scaling_accuracy[model_name][scaling_name] = accuracy_score(
            y_test, model.predict(test_features)
        )

# Rows are scalers, columns are models; reindex pins the row order.
scaling_accuracy_df = pd.DataFrame(scaling_accuracy).reindex(list(scaling_options))
print("\nTest accuracy for every scaler / model combination:")
print(scaling_accuracy_df.round(3))

# --- Question 10: Custom transformation for high cardinality categorical features ---
print("\n--- Question 10: Custom encoding for high cardinality categorical features ---")

def custom_high_cardinality_encoder(series, top_n=5, other_label='Other'):
    """Keep the ``top_n`` most frequent categories and pool everything else.

    Parameters
    ----------
    series : pandas.Series
        Categorical feature to reduce. The input is not modified.
    top_n : int, default 5
        Number of most frequent categories that keep their own value.
    other_label : str, default 'Other'
        Replacement written for every value outside the top ``top_n``.

    Returns
    -------
    pandas.Series
        Same index and name as ``series``. Values in the top ``top_n`` are
        unchanged; all others, including missing values, become
        ``other_label``.
    """
    # A categorical Series cannot hold a label that is not one of its declared
    # categories, so work on plain objects in that case.
    if isinstance(series.dtype, pd.CategoricalDtype):
        series = series.astype(object)
    # value_counts() skips missing values, so they never rank among the top.
    top_categories = series.value_counts().nlargest(top_n).index
    # Vectorized membership test instead of a Python-level call per element.
    return series.where(series.isin(top_categories), other_label)

# Demo on 'embarked': it is not a high-cardinality column, but with top_n=2
# the two most frequent ports keep their value and the rest become 'Other'.
df['embarked_custom_encoded'] = custom_high_cardinality_encoder(df['embarked'], top_n=2)
print(df[['embarked', 'embarked_custom_encoded']].head())

# One-hot encode the reduced feature, keeping an indicator for every category.
onehot_enc_custom = OneHotEncoder(sparse_output=False, drop=None)
custom_encoded = onehot_enc_custom.fit_transform(df[['embarked_custom_encoded']])
custom_columns = onehot_enc_custom.get_feature_names_out(['embarked_custom_encoded'])
custom_encoded_df = pd.DataFrame(custom_encoded, columns=custom_columns)

# Align both frames on a fresh 0..n-1 index before joining them column-wise.
df = pd.concat([df.reset_index(drop=True), custom_encoded_df], axis=1)

print(df[['embarked_custom_encoded', *custom_encoded_df.columns]].head())


Original data sample:
      sex  pclass embarked
0    male       3        S
1  female       1        C
2  female       3        S
3  female       1        S
4    male       3        S

--- Question 5: Label Encoding vs One-Hot Encoding ---
      sex  sex_label  sex_onehot_female  sex_onehot_male
0    male          1                0.0              1.0
1  female          0                1.0              0.0
2  female          0                1.0              0.0
3  female          0                1.0              0.0
4    male          1                0.0              1.0

--- Question 6: Combining Min-Max Scaling and Standardization ---
First 5 rows after Min-Max scaling and Standardization:
[[-0.58961986 -0.50023975]
 [ 0.64484799  0.78894661]
 [-0.28100289 -0.48664993]
 [ 0.41338527  0.42286111]
 [ 0.41338527 -0.4841333 ]]

--- Question 7: One-Hot Encoding 'sex' and 'embarked' ---
      sex embarked  sex_male  embarked_Q  embarked_S
0    male        S       1.0         0.0       