In [None]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.






In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load Titanic dataset from local CSV
titanic = pd.read_csv('/workspaces/AI_DATA_ANALYSIS_/src/Module 3/Hands-on - Data Quality Assessment & Profiling/titanic.csv')

# Drop missing values in 'sex' column
sex_feature = titanic[['Sex']].dropna()

# Label Encoding
label_enc = LabelEncoder()
sex_label_encoded = label_enc.fit_transform(sex_feature['Sex'])

# One-Hot Encoding
onehot_enc = OneHotEncoder(sparse_output=False)
sex_onehot_encoded = onehot_enc.fit_transform(sex_feature)

# Display
print("Label Encoded Values:\n", pd.Series(sex_label_encoded).value_counts())
print("\nOne-Hot Encoded DataFrame:\n", pd.DataFrame(sex_onehot_encoded, columns=onehot_enc.get_feature_names_out(['Sex'])).head())


Label Encoded Values:
 1    577
0    314
Name: count, dtype: int64

One-Hot Encoded DataFrame:
    Sex_female  Sex_male
0         0.0       1.0
1         1.0       0.0
2         1.0       0.0
3         1.0       0.0
4         0.0       1.0


In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Select numeric columns from Titanic dataset
numeric_data = titanic.select_dtypes(include='number').dropna()

# Apply Min-Max Scaling
minmax = MinMaxScaler()
data_minmax = minmax.fit_transform(numeric_data)

# Apply Standardization
standard = StandardScaler()
data_scaled = standard.fit_transform(data_minmax)

# Result: Now data has mean ≈ 0 and std ≈ 1
print(pd.DataFrame(data_scaled).describe())


                  0             1             2             3             4  \
count  7.140000e+02  7.140000e+02  7.140000e+02  7.140000e+02  7.140000e+02   
mean   1.592253e-16  1.243947e-17 -5.473368e-17 -1.492737e-17  9.454000e-17   
std    1.000701e+00  1.000701e+00  1.000701e+00  1.000701e+00  1.000701e+00   
min   -1.728532e+00 -8.270201e-01 -1.476364e+00 -2.016979e+00 -5.517031e-01   
25%   -8.740804e-01 -8.270201e-01 -1.476364e+00 -6.595416e-01 -5.517031e-01   
50%   -1.383587e-02 -8.270201e-01 -2.825656e-01 -1.170488e-01 -5.517031e-01   
75%    8.850279e-01  1.209160e+00  9.112324e-01  5.718310e-01  5.245701e-01   
max    1.708584e+00  1.209160e+00  9.112324e-01  3.465126e+00  4.829663e+00   

                  5             6  
count  7.140000e+02  7.140000e+02  
mean  -1.990316e-17  4.975789e-18  
std    1.000701e+00  1.000701e+00  
min   -5.058951e-01 -6.560759e-01  
25%   -5.058951e-01 -5.038498e-01  
50%   -5.058951e-01 -3.583992e-01  
75%    6.668618e-01 -2.495211e-02  


In [11]:
cat_features = titanic[['Sex', 'Embarked']].dropna()

# One-Hot Encode
onehot = OneHotEncoder(sparse_output=False, drop='first')  # Drop first to avoid dummy variable trap
encoded_cat = onehot.fit_transform(cat_features)

# Get feature names
encoded_df = pd.DataFrame(encoded_cat, columns=onehot.get_feature_names_out(cat_features.columns))
print(encoded_df.head())


   Sex_male  Embarked_Q  Embarked_S
0       1.0         0.0         1.0
1       0.0         0.0         0.0
2       0.0         0.0         1.0
3       0.0         0.0         1.0
4       1.0         0.0         1.0


In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Only non-null values
ordinal_data = titanic[['Pclass']].dropna()

# Ordinal Encoding
ordinal_enc = OrdinalEncoder()
encoded_pclass = ordinal_enc.fit_transform(ordinal_data)

# Attach to DataFrame
titanic['Pclass_encoded'] = encoded_pclass
print(titanic[['Pclass', 'Pclass_encoded']].dropna().head())


   Pclass  Pclass_encoded
0       3             2.0
1       1             0.0
2       3             2.0
3       1             0.0
4       3             2.0


In [15]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Cleaned dataset
titanic_clean = titanic[['Pclass', 'Age', 'Fare', 'Sex', 'Survived']].dropna()
X = pd.get_dummies(titanic_clean.drop('Survived', axis=1), drop_first=True)
y = titanic_clean['Survived']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Without scaling
tree = DecisionTreeClassifier().fit(X_train, y_train)
svm_unscaled = SVC().fit(X_train, y_train)

# With Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_scaled = SVC().fit(X_train_scaled, y_train)

print(f"Decision Tree (unscaled): {tree.score(X_test, y_test):.4f}")
print(f"SVM (unscaled): {svm_unscaled.score(X_test, y_test):.4f}")
print(f"SVM (scaled): {svm_scaled.score(X_test_scaled, y_test):.4f}")


Decision Tree (unscaled): 0.7374
SVM (unscaled): 0.6425
SVM (scaled): 0.7654


In [18]:
def frequency_encode(df, column):
    freq_map = df[column].value_counts(normalize=True)
    return df[column].map(freq_map)

# Simulate high-cardinality with 'deck'
titanic['Pclass_encoded'] = frequency_encode(titanic.fillna({'Pclass': 'Unknown'}), 'Pclass')
print(titanic[['Pclass', 'Pclass_encoded']].head())


   Pclass  Pclass_encoded
0       3        0.551066
1       1        0.242424
2       3        0.551066
3       1        0.242424
4       3        0.551066


In [5]:
print(titanic.columns)



Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
