In [None]:
# Step 1: Import libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Step 2: Upload your CSV dataset (Colab specific)
# This cell only works inside a Google Colab runtime, where `google.colab` is importable.
from google.colab import files
# The result of the upload is kept in `uploaded` so later cells can use it.
# NOTE(review): the recorded output of this cell shows the upload being saved
# under a suffixed name ("titanic_sample_dataset (2).csv"), so the name on disk
# can differ from the original filename — confirm which file later cells read.
uploaded = files.upload()


Saving titanic_sample_dataset.csv to titanic_sample_dataset (2).csv


In [None]:
# Step 3: Load the dataset
# Read the bytes returned by the upload step rather than a hard-coded path.
# Colab stores a re-uploaded file under a suffixed name (for example
# "titanic_sample_dataset (2).csv"), so 'titanic_sample_dataset.csv' on disk may
# be a stale copy left over from an earlier upload.
import io

if uploaded:
    # `files.upload()` returns {saved filename: file contents as bytes};
    # when several files were uploaded, the first one is used.
    uploaded_bytes = next(iter(uploaded.values()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded in this session: fall back to the file already on disk.
    df = pd.read_csv('titanic_sample_dataset.csv')
print("✅ Dataset Loaded Successfully")
print(df.head())


✅ Dataset Loaded Successfully
   PassengerId  Survived  Pclass           Name     Sex   Age  SibSp  Parch  \
0            1         0       3       John Doe    male  22.0      1      0   
1            2         1       1     Jane Smith  female  38.0      1      0   
2            3         1       3    Emily Davis  female  26.0      0      0   
3            4         1       1  Michael Brown    male  35.0      1      0   
4            5         0       3   Laura Wilson  female   NaN      0      0   

             Ticket     Fare Cabin Embarked  
0         A/5 21171   7.2500   NaN        S  
1          PC 17599  71.2833   C85        C  
2  STON/O2. 3101282   7.9250   NaN        S  
3            113803  53.1000  C123        S  
4            373450   8.0500   NaN        S  


In [None]:
# Step 4: Split features and target
# Target vector: the 'Survived' label.
y = df['Survived']
# Feature matrix: every remaining column except the ones not used as model inputs.
X = df.drop(labels=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns')

In [None]:
# Step 5: Identify column types
# int64/float64 columns are routed to the numeric pipeline, object columns to
# the categorical pipeline.
# NOTE(review): a column whose dtype matches neither selector (e.g. bool,
# category, int32) would end up in neither list — confirm none is expected here.
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

print("\n Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)


 Numerical Columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical Columns: ['Sex', 'Embarked']


In [None]:
# Step 6: Define Transformers
# Numeric branch: fill missing values with the column mean, then scale.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fit are ignored at transform time.
# The step name 'encoder' is looked up later when feature names are rebuilt.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Step 7: Combine transformers in ColumnTransformer
# Each entry is (name, transformer, columns); the names 'num' and 'cat' are the
# keys used later to map transformed columns back to feature names.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
# Step 8: Apply transformations
# Fits every transformer on X and transforms X in one call; `preprocessor` is
# fitted afterwards, which the feature-name lookup in Step 10 relies on.
X_processed = preprocessor.fit_transform(X)


In [None]:
# Step 9: Output the results
# Report the (rows, columns) shape of the transformed matrix; the column count
# covers the numeric columns plus one column per one-hot encoded category.
print("\n Data Preprocessing Completed")
print(" Shape of Processed Data:", X_processed.shape)


 Data Preprocessing Completed
 Shape of Processed Data: (5, 9)


In [None]:
# Step 10: Optional - Convert to DataFrame (for viewing)
# NOTE: This requires fetching feature names from OneHotEncoder
import numpy as np

def get_feature_names(preprocessor, numerical_cols=None, categorical_cols=None):
    """Return the output column names of a fitted preprocessor, in output order.

    Names are derived from the column lists stored on the fitted preprocessor
    itself (``preprocessor.transformers_``), so they always describe the
    columns the transformers were actually fitted on, even if the caller's own
    column lists have since been reordered or changed.

    Parameters
    ----------
    preprocessor : fitted ColumnTransformer-like object
        Must expose ``transformers_`` as an iterable of
        ``(name, transformer, columns)`` tuples. Only the entries named
        ``'num'`` and ``'cat'`` contribute names; any other entry is skipped.
        The ``'cat'`` transformer must expose ``named_steps['encoder']`` with a
        ``get_feature_names_out(input_features)`` method.
    numerical_cols, categorical_cols : list of str, optional
        Accepted for backward compatibility with existing callers and not used:
        the columns recorded on the fitted preprocessor are authoritative.

    Returns
    -------
    list
        Numeric column names as stored on the preprocessor, followed by the
        encoder's expanded names as plain ``str``, following the order of
        ``transformers_``.

    Raises
    ------
    KeyError
        If the ``'cat'`` transformer has no step named ``'encoder'``.
    """
    def _as_list(cols):
        # A single column may be given as a bare string; extending a list with
        # it would otherwise add one entry per character.
        return [cols] if isinstance(cols, str) else list(cols)

    output_features = []
    for name, trans, cols in preprocessor.transformers_:
        if name == 'num':
            # Imputing and scaling keep a one-to-one mapping with input columns.
            output_features.extend(_as_list(cols))
        elif name == 'cat':
            ohe = trans.named_steps['encoder']
            encoded_names = ohe.get_feature_names_out(_as_list(cols))
            # Normalise NumPy string scalars to plain str for a uniform list.
            output_features.extend(str(feature) for feature in encoded_names)
    return output_features

output_features = get_feature_names(preprocessor, numerical_cols, categorical_cols)
# The transformed matrix may be sparse (it then exposes `toarray`); pandas needs
# a dense array here, so densify only in that case.
if hasattr(X_processed, "toarray"):
    df_processed = pd.DataFrame(X_processed.toarray(), columns=output_features)
else:
    df_processed = pd.DataFrame(X_processed, columns=output_features)
print("\n📊 Processed DataFrame Sample:")
print(df_processed.head())


📊 Processed DataFrame Sample:
     Pclass       Age     SibSp  Parch      Fare  Sex_female  Sex_male  \
0  0.816497 -1.420094  0.816497    0.0 -0.816141         0.0       1.0   
1 -1.224745  1.334028  0.816497    0.0  1.530347         1.0       0.0   
2  0.816497 -0.731564 -1.224745    0.0 -0.791405         1.0       0.0   
3 -1.224745  0.817630  0.816497    0.0  0.864024         0.0       1.0   
4  0.816497  0.000000 -1.224745    0.0 -0.786825         1.0       0.0   

   Embarked_C  Embarked_S  
0         0.0         1.0  
1         1.0         0.0  
2         0.0         1.0  
3         0.0         1.0  
4         0.0         1.0  
