In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 2: Create a sample dataset
data = {
    'age': [25, 30, np.nan, 22, 28],
    'salary': [50000, 60000, 52000, np.nan, 58000],
    'gender': ['Male', 'Female', 'Female', 'Male', np.nan],
    'purchased': ['Yes', 'No', 'Yes', 'No', 'Yes']
}
df = pd.DataFrame(data)
print("Original Data:\n", df)

# Step 3: Separate features and target
X = df.drop('purchased', axis=1)
y = df['purchased']

# Step 4: Define preprocessing for numeric and categorical features
numeric_features = ['age', 'salary']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = ['gender']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Step 5: Combine transformations using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Step 6: Apply transformations
X_processed = preprocessor.fit_transform(X)

# Step 7: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Step 8: Show results
print("\nProcessed Feature Data (X):\n", X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed)
print("\nTrain/Test Split Shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)


Original Data:
     age   salary  gender purchased
0  25.0  50000.0    Male       Yes
1  30.0  60000.0  Female        No
2   NaN  52000.0  Female       Yes
3  22.0      NaN    Male        No
4  28.0  58000.0     NaN       Yes

Processed Feature Data (X):
 [[-0.46106945 -1.35581536  0.          1.        ]
 [ 1.38320834  1.35581536  1.          0.        ]
 [ 0.         -0.81348922  1.          0.        ]
 [-1.56763612  0.          0.          1.        ]
 [ 0.64549722  0.81348922  1.          0.        ]]

Train/Test Split Shapes:
X_train: (4, 4)
X_test: (1, 4)
y_train: (4,)
y_test: (1,)
