# **TASK 2: Perform Data Preprocessing**

In [None]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Step 2: Make the dataset reachable
# This cell only mounts Google Drive inside the Colab runtime;
# the CSV itself is read in the following cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the training data from the mounted Drive folder into a DataFrame.
# NOTE(review): the recorded column listing includes 'Unnamed: 0', presumably a
# row index that was written into the CSV — confirm, and consider index_col=0.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/train_data.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Display the column labels of the loaded frame so the feature/target
# names used in the next steps can be checked against what the CSV provides.
df.columns

Index(['Unnamed: 0', 'PassengerId', 'Survived', 'Sex', 'Age', 'Fare',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Family_size', 'Title_1', 'Title_2',
       'Title_3', 'Title_4', 'Emb_1', 'Emb_2', 'Emb_3'],
      dtype='object')

In [None]:
# Step 3: Pick the independent variables (X) and the target variable (y = Survived)
# The chosen predictors are a mix of numeric and categorical columns
feature_columns = [
    "Sex", "Age", "Fare",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Emb_1", "Emb_2", "Emb_3",
]
target_column = "Survived"

X = df[feature_columns]
y = df[target_column]

In [None]:
# Step 4: Group the selected columns by how they will be preprocessed

# Numeric columns: routed to the numeric pipeline (imputation + scaling)
numeric_features = [
    "Age",
    "Fare",
]

# Categorical columns: routed to the categorical pipeline (imputation + encoding)
categorical_features = [
    "Sex",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Emb_1", "Emb_2", "Emb_3",
]

In [None]:
# Step 5: Build the pipeline applied to the numeric columns
# Steps run in order:
#   1. "imputer" - replace missing values with the column median
#   2. "scaler"  - standardize each column (mean=0, std=1)
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Step 6: Create a pipeline for categorical features
# - Impute missing values with the most frequent category
# - Encode the categories numerically with OneHotEncoder
#
# The columns routed here (Sex, Pclass_*, Emb_*) are already two-valued indicator
# columns: with a plain OneHotEncoder each of them expands into two complementary
# columns (9 input features became 16 in the recorded run), which only adds
# perfectly redundant features. drop="if_binary" keeps a single output column
# for every feature that has exactly two categories and leaves features with
# more categories fully one-hot encoded.
# NOTE(review): using drop together with handle_unknown="ignore" needs a
# scikit-learn release that allows the combination (1.0+) — confirm the runtime.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Fill missing categorical values
    ("encoder", OneHotEncoder(handle_unknown="ignore", drop="if_binary"))  # One column per binary feature
])

In [None]:
# Step 7: Route each group of columns through its own pipeline
# ColumnTransformer applies num_pipeline to the numeric columns and
# cat_pipeline to the categorical columns, then concatenates the results
column_pipelines = [
    ("num", num_pipeline, numeric_features),
    ("cat", cat_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Step 8: Apply preprocessing to the features
# This returns a transformed version of X with all features numeric and scaled/encoded
X_processed = preprocessor.fit_transform(X)

In [None]:
# Step 9: Split the data into training and test sets (80% train, 20% test)
# Random_state is set for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

In [None]:
# Step 10: Summarise the train/test split
print("Data Split Summary")
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape : {X_test.shape}")
print(f"Training Labels Count  : {len(y_train)}")
print(f"Testing Labels Count   : {len(y_test)}\n")

# Preview the first ten training labels; wrapping .values in a new Series
# gives them a fresh 0-based index instead of the original row labels
print("Sample of y_train (Target Labels for Training):")
train_label_preview = pd.Series(y_train.values).head(10)
print(train_label_preview.to_string(index=True))

# Same preview for the test labels
print("\nSample of y_test (Target Labels for Testing):")
test_label_preview = pd.Series(y_test.values).head(10)
print(test_label_preview.to_string(index=True))

# Preview the first five processed training rows as a table;
# a matrix that exposes .toarray() is converted to a dense array first
print("\nSample of Processed Training Features (X_train):")
first_rows = X_train[:5]
if hasattr(X_train, 'toarray'):
    first_rows = first_rows.toarray()
X_train_df = pd.DataFrame(first_rows)
print(X_train_df.round(2).to_string(index=False))

Data Split Summary
Training Features Shape: (633, 16)
Testing Features Shape : (159, 16)
Training Labels Count  : 633
Testing Labels Count   : 159

Sample of y_train (Target Labels for Training):
0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    0
8    0
9    1

Sample of y_test (Target Labels for Testing):
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    1

Sample of Processed Training Features (X_train):
   0     1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
-0.11 -0.39 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0
 0.04 -0.50 0.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0
 1.04 -0.13 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0
-0.11 -0.49 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0
-1.95 -0.38 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0
