In [1]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Sample 'Sex' column from Titanic dataset
data = {'Sex': ['male', 'female', 'female', 'male', 'male']}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Label Encoding
label_encoder = LabelEncoder()
df['Sex_LabelEncoded'] = label_encoder.fit_transform(df['Sex'])

print("\nAfter Label Encoding:")
print(df)

# One-Hot Encoding using pandas get_dummies
df_onehot = pd.get_dummies(df['Sex'], prefix='Sex')

print("\nOne-Hot Encoded Columns:")
print(df_onehot)

# Combine original df with one-hot columns
df = pd.concat([df, df_onehot], axis=1)
print("\nDataFrame with One-Hot Encoding:")
print(df)







# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.





import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load data
iris = load_iris()
X = iris.data
feature_names = iris.feature_names
df = pd.DataFrame(X, columns=feature_names)

# Step 1: Standardization
scaler_standard = StandardScaler()
X_standardized = scaler_standard.fit_transform(df)
df_standardized = pd.DataFrame(X_standardized, columns=feature_names)

# Step 2: Min-Max Scaling after Standardization
scaler_minmax = MinMaxScaler()
X_scaled_combined = scaler_minmax.fit_transform(df_standardized)
df_scaled_combined = pd.DataFrame(X_scaled_combined, columns=feature_names)

# Output summary
print("Standardized Data (mean ~ 0, std ~ 1):")
print(df_standardized.describe())

print("\nCombined (Standardized + Min-Max Scaled to [0, 1]):")
print(df_scaled_combined.describe())


# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.

import pandas as pd
import seaborn as sns

# Load Titanic dataset
df = sns.load_dataset('titanic')

# Display original columns
print("Original columns:")
print(df[['sex', 'embarked']].head())

# Handle missing values in 'embarked' (if any)
df['embarked'].fillna('missing', inplace=True)

# Apply One-Hot Encoding to 'sex' and 'embarked'
df_encoded = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)

# Display resulting columns
print("\nOne-Hot Encoded columns:")
print(df_encoded.filter(regex='sex_|embarked_').head())









# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.


import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

# Load Titanic dataset
df = sns.load_dataset('titanic')

# Check original Pclass values
print("Original 'Pclass' column:")
print(df['pclass'].value_counts().sort_index())

# Convert to DataFrame for encoder compatibility
pclass_df = df[['pclass']]

# Optional: make sure it's treated as categorical
pclass_df['pclass'] = pclass_df['pclass'].astype(str)  # Encoding needs strings

# Define Ordinal Encoder with custom order (optional, but shows intent clearly)
encoder = OrdinalEncoder(categories=[['1', '2', '3']])  # '1' is highest class
pclass_encoded = encoder.fit_transform(pclass_df)

# Add back to DataFrame
df['pclass_encoded'] = pclass_encoded.astype(int)

# Show result
print("\nOrdinal Encoded 'Pclass':")
print(df[['pclass', 'pclass_encoded']].head())








# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.




import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define scalers
scalers = {
    'None': None,
    'Min-Max Scaling': MinMaxScaler(),
    'Standardization': StandardScaler()
}

# Store results
results = []

for scale_name, scaler in scalers.items():
    if scaler is not None:
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled, X_test_scaled = X_train, X_test

    # Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train_scaled, y_train)
    dt_acc = accuracy_score(y_test, dt.predict(X_test_scaled))

    # SVM
    svm = SVC(random_state=42)
    svm.fit(X_train_scaled, y_train)
    svm_acc = accuracy_score(y_test, svm.predict(X_test_scaled))

    results.append((scale_name, dt_acc, svm_acc))

# Display results
df_results = pd.DataFrame(results, columns=['Scaling Technique', 'Decision Tree Accuracy', 'SVM Accuracy'])
print(df_results)






# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.


import pandas as pd

# Sample data with high cardinality categorical column
df = pd.DataFrame({
    'user_id': ['U1', 'U2', 'U3', 'U2', 'U1', 'U4', 'U2', 'U5', 'U3', 'U1'],
    'purchase_amount': [100, 150, 200, 130, 180, 90, 160, 110, 170, 120]
})

# Custom frequency encoding function
def frequency_encode(column):
    freq = column.value_counts() / len(column)
    return column.map(freq)

# Apply transformation
df['user_id_encoded'] = frequency_encode(df['user_id'])

# Display result
print(df)



Original Data:
      Sex
0    male
1  female
2  female
3    male
4    male

After Label Encoding:
      Sex  Sex_LabelEncoded
0    male                 1
1  female                 0
2  female                 0
3    male                 1
4    male                 1

One-Hot Encoded Columns:
   Sex_female  Sex_male
0       False      True
1        True     False
2        True     False
3       False      True
4       False      True

DataFrame with One-Hot Encoding:
      Sex  Sex_LabelEncoded  Sex_female  Sex_male
0    male                 1       False      True
1  female                 0        True     False
2  female                 0        True     False
3    male                 1       False      True
4    male                 1       False      True
Standardized Data (mean ~ 0, std ~ 1):
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count       1.500000e+02      1.500000e+02       1.500000e+02   
mean       -1.468455e-15     -1.823726e-15      -1.610564e-15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pclass_df['pclass'] = pclass_df['pclass'].astype(str)  # Encoding needs strings
