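# Encoding the Titanic Dataset

This notebook loads the Titanic passenger data and demonstrates two ways of encoding categorical columns: one-hot encoding of `Pclass` and `Sex` with `ColumnTransformer` + `OneHotEncoder`, and label encoding of `Sex` with `LabelEncoder`.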



In [24]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [25]:
# Step 2: Load the Dataset
# The CSV is fetched directly over HTTPS from the Stanford CS109 course
# archive, so this cell needs network access.
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [26]:
# Step 3: Display Dataset Info
print(df.head())  # View first few rows
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()  # Check data types

   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8

In [27]:
# Step 4: Select Categorical Features for Encoding
# Take every row of just the two columns to be encoded; .copy() makes X an
# independent frame, so later changes to X cannot affect df.
X = df.loc[:, ['Pclass', 'Sex']].copy()

In [28]:
# Step 5: Apply One-Hot Encoding (for independent categorical variables)
# OneHotEncoder emits a sparse matrix, and ColumnTransformer only returns a
# dense array when the stacked result's density is >= sparse_threshold
# (0.3 by default). Step 7 wraps X_encoded in pd.DataFrame(), which needs a
# dense 2-D array, so request dense output explicitly instead of relying on
# how dense this particular data happens to be.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Pclass', 'Sex'])],
    remainder='passthrough',  # no effect here: X holds only the encoded columns
    sparse_threshold=0,  # always return a dense NumPy array
)
X_encoded = ct.fit_transform(X)  # Fit the encoder and transform the data

In [29]:
# Step 6: Apply Label Encoding (for a single categorical variable)
# Fit and transform as two explicit steps; the fitted encoder stays bound to
# `le` for later use.
le = LabelEncoder()
le.fit(df['Sex'])
# Integer codes are stored next to the original column: 'male' -> 1, 'female' -> 0
df['Sex_LabelEncoded'] = le.transform(df['Sex'])

In [30]:
# Step 7: Display Encoded Data
print("\nOne-Hot Encoded Independent Variables:")
print(pd.DataFrame(X_encoded).head())  # Convert to DataFrame for better readability

print("\nLabel Encoded 'Sex' Column:")
print(df[['Sex', 'Sex_LabelEncoded']].head())  # Display first 5 rows


One-Hot Encoded Independent Variables:
     0    1    2    3    4
0  0.0  0.0  1.0  0.0  1.0
1  1.0  0.0  0.0  1.0  0.0
2  0.0  0.0  1.0  1.0  0.0
3  1.0  0.0  0.0  1.0  0.0
4  0.0  0.0  1.0  0.0  1.0

Label Encoded 'Sex' Column:
      Sex  Sex_LabelEncoded
0    male                 1
1  female                 0
2  female                 0
3  female                 0
4    male                 1
