In [None]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.
# Label Encoding vs One-Hot Encoding on Titanic Dataset
# This script demonstrates the difference between Label Encoding and One-Hot Encoding
# using the 'Sex' feature from the Titanic dataset

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Titanic example dataset through seaborn's dataset loader
titanic = sns.load_dataset('titanic')

# Preview the raw data before any encoding is applied
print("First 5 rows of the Titanic dataset:")
print(titanic.head())

# Inspect the categorical 'sex' feature: its distinct values, then how often
# each one occurs
for heading, summary in (
    ("\nUnique values in 'sex' column:", titanic['sex'].unique()),
    ("\nValue counts for 'sex':", titanic['sex'].value_counts()),
):
    print(heading)
    print(summary)

# Work on a copy so the encoded columns added later never touch `titanic`
df = titanic.copy()

# LABEL ENCODING
print("\n--- LABEL ENCODING ---")

# Learn the categories of 'sex' and replace each one with its integer code;
# the codes are stored next to the original column for comparison
label_encoder = LabelEncoder()
sex_codes = label_encoder.fit_transform(df['sex'])
df['sex_label_encoded'] = sex_codes

# Preview the original values beside their integer codes
print("\nFirst 5 rows with label encoding:")
print(df.loc[:, ['sex', 'sex_label_encoded']].head())

# A category's position in `classes_` is the integer code assigned to it,
# so enumerating the fitted classes reproduces the mapping
print("\nLabel Encoding mapping:")
for code, category in enumerate(label_encoder.classes_):
    print(f"{category} -> {code}")

# ONE-HOT ENCODING
print("\n--- ONE-HOT ENCODING ---")

# Method 1: Using pandas get_dummies
# dtype=int pins the indicator columns to 0/1. Without it, recent pandas
# releases emit booleans (True/False), which would not line up with the
# integer label codes or with the "binary column" wording used in this demo.
df_dummies = pd.get_dummies(df['sex'], prefix='sex', dtype=int)

# Add the one-hot encoded columns to the working dataframe
df = pd.concat([df, df_dummies], axis=1)

# Display the first few rows with one-hot encoding
print("\nFirst 5 rows with one-hot encoding (using pandas get_dummies):")
print(df[['sex', 'sex_female', 'sex_male']].head())

# Method 2: Using scikit-learn OneHotEncoder
print("\nOne-Hot Encoding using scikit-learn:")
# Dense output (sparse_output=False) so the result can be wrapped in a
# DataFrame directly; dtype=int keeps it consistent with Method 1 instead of
# the encoder's default floating-point output.
onehot_encoder = OneHotEncoder(sparse_output=False, dtype=int)

# The encoder expects a 2-D input of shape (n_samples, n_features);
# selecting with a list of columns yields a single-column 2-D array
sex_reshaped = df[['sex']].to_numpy()

# Fit and transform the data
sex_onehot = onehot_encoder.fit_transform(sex_reshaped)

# Wrap the encoded array in a DataFrame, naming one column per learned
# category and reusing df's index so the rows stay aligned with df
sex_onehot_df = pd.DataFrame(
    sex_onehot,
    columns=[f"sex_{category}" for category in onehot_encoder.categories_[0]],
    index=df.index,
)

# Display the first few rows with one-hot encoding.
# Both objects share df's index, so they can be concatenated as-is.
print("\nFirst 5 rows with one-hot encoding (using scikit-learn):")
print(pd.concat([df['sex'], sex_onehot_df], axis=1).head())

# VISUALIZE THE DIFFERENCES
# Fix the bar order explicitly. countplot otherwise places string categories
# in order of first appearance in the data, so relabelling the ticks by hand
# (as was done previously) can attach 'female'/'male' to the wrong bars.
sex_order = sorted(df['sex'].dropna().unique())

fig, (ax_label, ax_onehot) = plt.subplots(1, 2, figsize=(14, 6))

# Plot 1: Label Encoding
# Bars are grouped by original category and coloured by the integer code
sns.countplot(x='sex', hue='sex_label_encoded', data=df,
              order=sex_order, ax=ax_label)
ax_label.set_title('Label Encoding of Sex Feature')
ax_label.set_xlabel('Original Sex Category')
ax_label.set_ylabel('Count')

# Plot 2: One-Hot Encoding
# Reshape the indicator columns to long form: one row per
# (passenger, indicator column) pair
temp_df = df.melt(id_vars=['sex'], value_vars=['sex_female', 'sex_male'],
                  var_name='one_hot_category', value_name='is_category')
temp_df = temp_df[temp_df['is_category'] == 1]  # Only keep rows where the category is present
sns.countplot(x='sex', hue='one_hot_category', data=temp_df,
              order=sex_order, ax=ax_onehot)
ax_onehot.set_title('One-Hot Encoding of Sex Feature')
ax_onehot.set_xlabel('Original Sex Category')
ax_onehot.set_ylabel('Count')

# Write the figure to disk and release it rather than displaying it
fig.tight_layout()
fig.savefig('encoding_comparison.png')
plt.close(fig)

# COMPARISON SUMMARY
# The written summary is kept as data and emitted line by line; entries that
# start with "\n" open a new section with a blank line above it
summary_lines = (
    "\n--- COMPARISON SUMMARY ---",
    "Label Encoding:",
    "- Transforms categorical values into numerical values",
    "- For 'sex' column: female -> 0, male -> 1",
    "- Maintains a single column",
    "- Introduces ordinal relationship (which may not be appropriate for nominal data)",
    "- Memory efficient",
    "\nOne-Hot Encoding:",
    "- Creates a new binary column for each category",
    "- For 'sex' column: creates 'sex_female' and 'sex_male' columns",
    "- Expands to multiple columns (one per category)",
    "- Avoids ordinal relationship, better for nominal data",
    "- Less memory efficient, but more appropriate for machine learning algorithms",
    "\nWhen to use each:",
    "- Label Encoding: Good for ordinal data (e.g., 'low', 'medium', 'high')",
    "- One-Hot Encoding: Better for nominal data with no inherent order (e.g., 'sex', 'country')",
    "- For binary features like 'sex', both work similarly from a mathematical perspective",
    "  but one-hot encoding is generally preferred for consistency and clarity",
)
for summary_line in summary_lines:
    print(summary_line)

# Build the final side-by-side table: take the first ten rows of the original
# column and its encodings, then give the columns reader-friendly headings
comparison_df = (
    df[['sex', 'sex_label_encoded', 'sex_female', 'sex_male']]
    .head(10)
    .rename(columns={
        'sex': 'Original',
        'sex_label_encoded': 'Label Encoded',
        'sex_female': 'One-Hot (Female)',
        'sex_male': 'One-Hot (Male)',
    })
)

print("\nSide-by-side comparison of encodings:")
print(comparison_df)



# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same dataset and explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with an SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.






First 5 rows of the Titanic dataset:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Unique values in 'sex' column:
['male' 'female']

Value counts for 'sex':
sex
male      577
female    314
Name: count, dtype: int64

--- LABEL ENCODING ---

First 5 rows with label 