# Feature Engineering and Selection

#### Setup, Data Loading, and Initial Cleaning

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the Titanic passenger table from the Kaggle input directory.
titanic_csv_path = "/kaggle/input/titanic/titanic.csv"
df = pd.read_csv(titanic_csv_path)

In [3]:
# Remove the PassengerId and Ticket columns, then preview the first rows.
df = df.drop(columns=['PassengerId', 'Ticket'])
print("Initial Data Snapshot:")
print(df.head())

Initial Data Snapshot:
   Survived  Pclass                                               Name  \
0         0       3                            Braund, Mr. Owen Harris   
1         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2         1       3                             Heikkinen, Miss. Laina   
3         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4         0       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch     Fare Cabin Embarked  
0    male  22.0      1      0   7.2500   NaN        S  
1  female  38.0      1      0  71.2833   C85        C  
2  female  26.0      0      0   7.9250   NaN        S  
3  female  35.0      1      0  53.1000  C123        S  
4    male  35.0      0      0   8.0500   NaN        S  


#### Initial Data Cleaning

In [4]:
# Impute missing values in one pass: Age gets the column median,
# Embarked gets its most frequent value (first mode).
missing_value_fills = {
    'Age': df['Age'].median(),
    'Embarked': df['Embarked'].mode()[0],
}
df = df.fillna(missing_value_fills)

In [5]:
# Drop the Cabin feature because most of its values are missing.
df = df.drop(columns='Cabin')

In [6]:
# Separate the target column (Survived) from the remaining feature columns.
target_column = 'Survived'
X = df.drop(columns=target_column)
y = df[target_column]

print("\nMissing values handled and Target separated.")


Missing values handled and Target separated.


#### Feature Engineering

In [7]:
# --- 1. Feature Extraction (Title from Name) ---
# The title is the run of letters that follows a space and ends with a period,
# e.g. "Mr" in "Braund, Mr. Owen Harris".
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot; in a normal string literal "\." is an invalid Python escape
# sequence, which raises a SyntaxWarning on Python 3.12+.
X['Title'] = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Name is no longer needed once the title has been extracted.
X = X.drop(columns='Name')

In [8]:
# Group rare titles
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# Build a single lookup: every rare title maps to 'Rare', and the alternate
# forms Mlle/Ms/Mme map to Miss/Miss/Mrs. Titles not in the lookup are unchanged.
title_map = {title: 'Rare' for title in rare_titles}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
X['Title'] = X['Title'].replace(title_map)

In [9]:
# --- 2. Interaction Feature (Family Size) ---
# FamilySize = the passenger (1) + siblings/spouses (SibSp) + parents/children (Parch).
X['FamilySize'] = 1 + X['SibSp'] + X['Parch']
# IsAlone is 1 when FamilySize is exactly 1, otherwise 0.
X['IsAlone'] = (X['FamilySize'] == 1).astype('int64')
# The raw counts are dropped after being combined into FamilySize.
X = X.drop(columns=['SibSp', 'Parch'])

In [10]:
# --- 3. Binning/Grouping (Age) ---
# Four labelled bins between the edges below; with pd.cut's default settings
# each bin includes its right edge: (0, 18], (18, 30], (30, 50], (50, 80].
age_bin_edges = [0, 18, 30, 50, 80]
age_bin_labels = ['Child', 'Young Adult', 'Adult', 'Senior']
X['AgeGroup'] = pd.cut(X['Age'], bins=age_bin_edges, labels=age_bin_labels)
# The continuous Age column is replaced by its binned version.
X = X.drop(columns='Age')

In [11]:
# --- 4. Encoding ---
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummies are not perfectly collinear.
categorical_columns = ['Pclass', 'Sex', 'Embarked', 'Title', 'AgeGroup']
X_processed = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

print("\nEngineered Features Snapshot:")
print(X_processed.head())
print(f"Total features after engineering: {X_processed.shape[1]}")

# Split and Scale for Feature Selection
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42
)

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Rebuild labelled frames from the scaled arrays. The original row index is
# carried over so these frames stay aligned with y_train / y_test (which keep
# their shuffled labels from the split) in any index-aware pandas operation.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)


Engineered Features Snapshot:
      Fare  FamilySize  IsAlone  Pclass_2  Pclass_3  Sex_male  Embarked_Q  \
0   7.2500           2        0     False      True      True       False   
1  71.2833           2        0     False     False     False       False   
2   7.9250           1        1     False      True     False       False   
3  53.1000           2        0     False     False     False       False   
4   8.0500           1        1     False      True      True       False   

   Embarked_S  Title_Miss  Title_Mr  Title_Mrs  Title_Rare  \
0        True       False      True      False       False   
1       False       False     False       True       False   
2        True        True     False      False       False   
3        True       False     False       True       False   
4        True       False      True      False       False   

   AgeGroup_Young Adult  AgeGroup_Adult  AgeGroup_Senior  
0                  True           False            False  
1              

#### Feature Selection - Filter Method (SelectKBest)

In [12]:
# Score every feature against the target with f_classif and keep the
# top_k highest-scoring ones.
top_k = 10
k_best_selector = SelectKBest(score_func=f_classif, k=top_k)
k_best_selector.fit(X_train_scaled_df, y_train)

# Pair each score with its column name, then take the top_k names,
# ordered from highest to lowest score.
feature_scores = pd.Series(k_best_selector.scores_, index=X_train_scaled_df.columns)
selected_k_best_features = list(feature_scores.nlargest(top_k).index)

print("--- Filter Method: SelectKBest (Top 10) ---")
print("Selected Features based on F-Value:")
print(selected_k_best_features)

--- Filter Method: SelectKBest (Top 10) ---
Selected Features based on F-Value:
['Title_Mr', 'Sex_male', 'Title_Mrs', 'Title_Miss', 'Pclass_3', 'Fare', 'IsAlone', 'Embarked_S', 'AgeGroup_Young Adult', 'Pclass_2']


In [13]:
# Compare models with all features vs. KBest features

# Baseline: logistic regression on every scaled feature.
logreg_all = LogisticRegression(random_state=42)
logreg_all.fit(X_train_scaled, y_train)

# Same model restricted to the SelectKBest columns.
X_train_kbest = X_train_scaled_df[selected_k_best_features]
X_test_kbest = X_test_scaled_df[selected_k_best_features]
logreg_kbest = LogisticRegression(random_state=42)
logreg_kbest.fit(X_train_kbest, y_train)

# Accuracy of both models on the held-out test split.
pred_all = logreg_all.predict(X_test_scaled)
pred_kbest = logreg_kbest.predict(X_test_kbest)
acc_all = accuracy_score(y_test, pred_all)
acc_kbest = accuracy_score(y_test, pred_kbest)

print(f"Accuracy (All Features): {acc_all:.4f}")
print(f"Accuracy (SelectKBest Features): {acc_kbest:.4f}")

Accuracy (All Features): 0.8246
Accuracy (SelectKBest Features): 0.7985


#### Feature Selection - Wrapper Method (RFE)

In [14]:
# Recursive Feature Elimination: refit the logistic regression repeatedly,
# removing one feature per iteration (step=1) until 10 remain.
logreg_rfe = LogisticRegression(random_state=42)
rfe_selector = RFE(estimator=logreg_rfe, n_features_to_select=10, step=1)
rfe_selector.fit(X_train_scaled, y_train)

# support_ is a boolean mask over the input columns; map it back to names.
selected_rfe_mask = rfe_selector.support_
selected_rfe_features = list(X_processed.columns[selected_rfe_mask])

print("\n--- Wrapper Method: Recursive Feature Elimination (Top 10) ---")
print("Selected Features based on RFE:")
print(selected_rfe_features)

# Final comparison using the RFE-selected features
X_train_rfe = X_train_scaled_df[selected_rfe_features]
X_test_rfe = X_test_scaled_df[selected_rfe_features]
logreg_rfe_final = LogisticRegression(random_state=42)
logreg_rfe_final.fit(X_train_rfe, y_train)

pred_rfe = logreg_rfe_final.predict(X_test_rfe)
acc_rfe = accuracy_score(y_test, pred_rfe)

print(f"Accuracy (RFE Selected Features): {acc_rfe:.4f}")


--- Wrapper Method: Recursive Feature Elimination (Top 10) ---
Selected Features based on RFE:
['Fare', 'FamilySize', 'IsAlone', 'Pclass_3', 'Sex_male', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare']
Accuracy (RFE Selected Features): 0.8172


## RFE yields a simpler model with one-third fewer features (10 of the 15 engineered features) for a marginal drop in test accuracy of less than one percentage point (0.8172 vs. 0.8246), making it a better choice for production due to easier maintenance and interpretability.