# In [None]:
# Feature Engineering Example - Titanic Dataset
# Name: Vishal Shende
# PRN: 202401110034
# AIML (A2)
# ---------------------------------------------
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif


# Load the dataset
# The Titanic CSV is read directly from the raw GitHub URL (requires network access).
data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)


# Confirm the load and preview the first rows in a single call; the "\n"
# separator reproduces the blank line between the banner and the table.
print("✅ Dataset Loaded Successfully!\n", data.head(), sep="\n")


# ---------------------------------------------
# Step 1: Check for missing values
# Count the NaN entries per column so it is clear which columns need imputing.
print("\nMissing values in each column:", data.isna().sum(), sep="\n")


# ---------------------------------------------
# Step 2: Handle missing values using imputation
# Numeric columns: replace NaN with the column mean.
# NOTE: the same imputer is re-fit for each column, so after this step it
# only remembers the mean of 'Fare'.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns an (n_rows, 1) array; ravel() flattens it to the 1-D
# shape expected for a single-column assignment.
data['Age'] = imputer.fit_transform(data[['Age']]).ravel()
data['Fare'] = imputer.fit_transform(data[['Fare']]).ravel()
# Categorical column: fill NaN with the most frequent value (mode).
# Assign the result back instead of calling fillna(inplace=True) on
# data['Embarked']: that chained in-place call acts on an intermediate object,
# raises a FutureWarning on pandas 2.x, and leaves `data` unchanged once
# Copy-on-Write is enabled.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])


# ---------------------------------------------
# Step 3: Encode categorical variables
# 'Sex' becomes an integer code. LabelEncoder numbers its classes in sorted
# order, which gives Female=0, Male=1.
label_encoder = LabelEncoder()
label_encoder.fit(data['Sex'])
data['Sex'] = label_encoder.transform(data['Sex'])
# 'Embarked' is one-hot encoded; drop_first=True omits the first level so the
# indicator columns are not redundant.
data = pd.get_dummies(data=data, columns=['Embarked'], drop_first=True)


# ---------------------------------------------
# Step 4: Feature scaling
# Standardise 'Age' and 'Fare' to zero mean and unit variance. The scaler is
# fitted first and then applied, and stays available afterwards as `scaler`.
scaler = StandardScaler()
data[['Age', 'Fare']] = scaler.fit(data[['Age', 'Fare']]).transform(
    data[['Age', 'Fare']]
)


# ---------------------------------------------
# Step 5: Dimensionality reduction using PCA (optional)
# NOTE: two input columns are projected onto two components, so this
# re-expresses 'Age'/'Fare' on the principal axes rather than reducing the
# number of dimensions.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(data[['Age', 'Fare']])
# Transposing yields one row per component, which unpacks straight into the
# two new columns.
data['PCA1'], data['PCA2'] = pca_features.T


# ---------------------------------------------
# Step 6: Feature selection (SelectKBest)
# Candidate predictors and the target column.
X = data[['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']]
y = data['Survived']


# Keep the 3 features with the highest ANOVA F-score against the target.
selector = SelectKBest(f_classif, k=3)
X_new = selector.fit(X, y).transform(X)


# Map the positions of the retained columns back to their names.
selected_features = X.columns[selector.get_support(indices=True)]
print("\nSelected Important Features:", list(selected_features))


# ---------------------------------------------
# Step 7: Summary
# Report completion, the final (rows, columns) shape, and a preview of the
# transformed frame.
print("\n✅ Feature Engineering Completed Successfully!")
print("Transformed dataset shape:", data.shape)
print("Few rows after transformation:", data.head(), sep="\n")
