In [None]:
# Project Title: Feature Optimization for Classification Problems using Recursive Feature Elimination (RFE)

# --- Import libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Step 1: Load dataset ---
# The path is absolute (Colab-style): the Titanic CSV must exist at
# /content/titanic.csv, otherwise read_csv raises FileNotFoundError.
data = pd.read_csv("/content/titanic.csv")

# --- Step 2: Data preprocessing ---
# Drop columns treated as irrelevant for modelling.
data = data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Fill missing values: median for Age, most frequent value for Embarked.
# The filled column is assigned back instead of calling
# data[col].fillna(..., inplace=True): that chained in-place call operates on
# an intermediate object, emits a FutureWarning on current pandas, and will
# leave `data` unchanged under pandas 3.0 (NaNs would then reach the models).
# NOTE(review): these fill values are computed on the full dataset, i.e. before
# the train/test split performed later in this script.
data["Age"] = data["Age"].fillna(data["Age"].median())
data["Embarked"] = data["Embarked"].fillna(data["Embarked"].mode()[0])

# Convert categorical variables into dummy (indicator) columns; drop_first=True
# omits the first level of each encoded column.
data = pd.get_dummies(data, drop_first=True)

# --- Step 3: Split into features and target ---
# "Survived" is the label; every remaining column is a feature.
y = data["Survived"]
X = data.drop(columns="Survived")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: scaling statistics are learned from the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Step 4: Baseline Model (all features) ---
# Logistic regression trained on every standardized feature.
baseline_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)

# --- Step 5: RFE for feature selection ---
# Recursive feature elimination around a logistic-regression estimator,
# keeping the 5 top-ranked features.
estimator = LogisticRegression(max_iter=1000)
rfe = RFE(estimator, n_features_to_select=5).fit(X_train, y_train)

# Reduce both splits to the columns selected by RFE.
X_train_rfe, X_test_rfe = rfe.transform(X_train), rfe.transform(X_test)

# --- Step 6: Optimized Model (RFE features only) ---
# Same model family as the baseline, trained on the reduced feature set.
optimized_model = LogisticRegression(max_iter=1000).fit(X_train_rfe, y_train)
y_pred_optimized = optimized_model.predict(X_test_rfe)

# --- Step 7: Compare Results ---
# Print the same four test-set metrics for each model, baseline first.
for _heading, _predictions in (
    ("Baseline Model (All Features):", y_pred_baseline),
    ("\nOptimized Model (RFE Features):", y_pred_optimized),
):
    print(_heading)
    for _label, _metric in (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    ):
        print(_label, _metric(y_test, _predictions))

# --- Step 8: Print selected features ---
# rfe.support_ is a boolean mask over the feature columns; indexing X.columns
# with it recovers the names of the features RFE kept.
_kept_mask = rfe.support_
selected_features = X.columns[_kept_mask]
print("\nSelected Features by RFE:", selected_features.tolist())


Baseline Model (All Features):
Accuracy: 0.8100558659217877
Precision: 0.7857142857142857
Recall: 0.7432432432432432
F1 Score: 0.7638888888888888

Optimized Model (RFE Features):
Accuracy: 0.8044692737430168
Precision: 0.7746478873239436
Recall: 0.7432432432432432
F1 Score: 0.7586206896551724

Selected Features by RFE: ['Pclass', 'Age', 'SibSp', 'Sex_male', 'Embarked_S']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Age"].fillna(data["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Embarked"].fillna(data["Embarked"].mode()[0], inplace=True)
