In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Display first few rows
print(df.head())

# Question 5: Label Encoding vs One-Hot Encoding for 'Sex' feature
print("\n--- Question 5: Label Encoding vs One-Hot Encoding ---")
# Label Encoding
label_encoder = LabelEncoder()
df['Sex_LabelEncoded'] = label_encoder.fit_transform(df['Sex'])

# One-Hot Encoding
df = pd.get_dummies(df, columns=['Sex'], drop_first=True)

print("Label Encoded 'Sex':")
print(df[['Sex_LabelEncoded']].head())
print("\nOne-Hot Encoded 'Sex':")
print(df[['Sex_male']].head())

# Question 6: Combining Feature Scaling Techniques
print("\n--- Question 6: Combining Min-Max Scaling and Standardization ---")
X = df[['Age', 'Fare']].fillna(df[['Age', 'Fare']].mean())  # Handle missing values
y = df['Survived']

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
X_minmax = scaler_minmax.fit_transform(X)

# Standardization
scaler_std = StandardScaler()
X_std = scaler_std.fit_transform(X)

# Split and demonstrate
X_train_minmax, X_test_minmax, y_train, y_test = train_test_split(X_minmax, y, test_size=0.3, random_state=42)
X_train_std, X_test_std = train_test_split(X_std, test_size=0.3, random_state=42)

print("Scaled Data with Min-Max and Standardization")
print("Min-Max Scaled Data (First 5 rows):", X_minmax[:5])
print("Standardized Data (First 5 rows):", X_std[:5])

# Question 7: Handling Multiple Categorical Features using One-Hot Encoding
print("\n--- Question 7: Handling Multiple Categorical Features ---")
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)
print(df[['Embarked_Q', 'Embarked_S']].head())  # 'Embarked_Q' and 'Embarked_S' are the encoded columns

# Question 8: Ordinal Encoding for 'Pclass'
print("\n--- Question 8: Ordinal Encoding for 'Pclass' ---")
ordinal_encoder = OrdinalEncoder(categories=[['3', '2', '1']])
df['Pclass_OrdinalEncoded'] = ordinal_encoder.fit_transform(df[['Pclass']])
print(df[['Pclass', 'Pclass_OrdinalEncoded']].head())

# Question 9: Impact of Scaling on Decision Tree and SVM
print("\n--- Question 9: Impact of Scaling on Decision Tree and SVM ---")
# Decision Tree Model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_minmax, y_train)
dt_pred = dt_model.predict(X_test_minmax)
dt_accuracy = accuracy_score(y_test, dt_pred)

# SVM Model
svm_model = SVC(random_state=42)
svm_model.fit(X_train_std, y_train)
svm_pred = svm_model.predict(X_test_std)
svm_accuracy = accuracy_score(y_test, svm_pred)

print(f"Decision Tree accuracy with Min-Max Scaling: {dt_accuracy:.4f}")
print(f"SVM accuracy with Standardization: {svm_accuracy:.4f}")

# Question 10: Custom Transformation for High Cardinality Categorical Features
print("\n--- Question 10: Custom Transformation for High Cardinality Categorical Features ---")
def custom_high_cardinality_encoding(df, column_name):
    """
    Custom transformation function for encoding high cardinality categorical features
    """
    freq = df[column_name].value_counts()
    rare_categories = freq[freq < 10].index  # Treat categories with frequency < 10 as 'Other'
    df[column_name] = df[column_name].apply(lambda x: x if x not in rare_categories else 'Other')
    return df

# Apply to 'Ticket' column (which has many unique values)
df = custom_high_cardinality_encoding(df, 'Ticket')
print(df[['Ticket']].head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

-

ValueError: Unsorted categories are not supported for numerical categories