In [1]:
# ### Step 0: Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the training CSV from its path relative to the notebook
file_path = '../datasets/train.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Take an independent deep copy to work on; the steps below only
# reassign/modify `df_processed`, leaving `df` as it was loaded
df_processed = df.copy(deep=True)

In [3]:
# ### Step 1: Initial Data Cleaning (from Part 2)
print("Starting initial data cleaning...")

# Per-column fill values: Age gets the column median, Embarked gets the
# first value returned by mode() (the most frequent entry)
fill_values = {
    'Age': df_processed['Age'].median(),
    'Embarked': df_processed['Embarked'].mode()[0],
}
df_processed = df_processed.fillna(value=fill_values)

# Remove the Cabin, Ticket and PassengerId columns
df_processed = df_processed.drop(columns=['Cabin', 'Ticket', 'PassengerId'])
print("Initial cleaning complete.")

Starting initial data cleaning...
Initial cleaning complete.


In [4]:
# ### Step 2: Feature Engineering (Creating New Features)
# Adds FamilySize, IsAlone and Title to df_processed, then drops the columns
# they were derived from (Name, SibSp, Parch).
print("Performing feature engineering...")
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger)
df_processed['FamilySize'] = df_processed['SibSp'] + df_processed['Parch'] + 1

# IsAlone is 1 when FamilySize equals 1, otherwise 0
df_processed['IsAlone'] = (df_processed['FamilySize'] == 1).astype(int)

# Title = the run of ASCII letters that follows a space and ends with a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped period; in a normal string literal `\.` is an invalid escape
# sequence and Python emits a warning for it.
df_processed['Title'] = df_processed['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the listed titles into a single 'Rare' bucket
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
df_processed['Title'] = df_processed['Title'].replace(rare_titles, 'Rare')

# Fold variant spellings into the Miss / Mrs groups
df_processed['Title'] = df_processed['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

# We no longer need the Name, SibSp, or Parch columns
df_processed = df_processed.drop(['Name', 'SibSp', 'Parch'], axis=1)
print("New features created.")

Performing feature engineering...
New features created.


  df_processed['Title'] = df_processed['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [5]:
# ### Step 3: Final Data Preparation (Encoding)
print("Performing final encoding...")

# Lookup tables turning the text categories into numeric codes
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
sex_codes = {'male': 0, 'female': 1}

# Replace both columns in one step; a Title that is absent from the
# mapping comes back as NaN from map() and is then filled with 0
df_processed = df_processed.assign(
    Title=df_processed['Title'].map(title_mapping).fillna(0),
    Sex=df_processed['Sex'].map(sex_codes),
)

# Expand Embarked into integer indicator columns, dropping the first level
df_processed = pd.get_dummies(
    df_processed,
    columns=['Embarked'],
    drop_first=True,
    dtype=int,
)
print("Encoding complete.")

Performing final encoding...
Encoding complete.


In [6]:
# ### Step 4: Define Feature Sets and Scale Them
# Builds the two feature matrices and the target, then standardizes each
# feature set using statistics computed from the training rows only.
print("Preparing feature sets for modeling...")
# Define the two different sets of features we want to test
original_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked_Q', 'Embarked_S']
new_features = ['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone', 'Title', 'Embarked_Q', 'Embarked_S']

# Create our dataframes
X_original = df_processed[original_features]
X_new = df_processed[new_features]
y = df_processed['Survived']

# Work out which row positions end up in the training split. train_test_split
# picks rows from the number of samples and the random_state alone, so using
# the same test_size=0.2 / random_state=42 as the split in Step 5 selects the
# same rows. Keep these two values in sync with that step.
train_positions, _ = train_test_split(
    np.arange(len(df_processed)), test_size=0.2, random_state=42
)

# Fit each scaler on the training rows only, so that the mean/std of the
# test rows never influence the features the models are trained on.
# One scaler per feature set: refitting a single instance would discard the
# statistics of the first set. `scaler` stays bound to the scaler of the new
# feature set, which is what it referred to after the previous version ran.
scaler_original = StandardScaler().fit(X_original.iloc[train_positions])
scaler = StandardScaler().fit(X_new.iloc[train_positions])

# Transform all rows and wrap the arrays back into DataFrames, keeping the
# source row index so the frames stay aligned with `y`
X_original_scaled = pd.DataFrame(
    scaler_original.transform(X_original),
    columns=original_features,
    index=X_original.index,
)
X_new_scaled = pd.DataFrame(
    scaler.transform(X_new),
    columns=new_features,
    index=X_new.index,
)
print("Scaling complete.")

Preparing feature sets for modeling...
Scaling complete.


In [7]:
# ### Step 5: Train and Evaluate All Four Models
# Fits Logistic Regression and Random Forest on each feature set and records
# the hold-out accuracy of every combination.
print("\n--- Starting Model Showdown ---")
# Split both feature sets and the target in a single call: every array is cut
# at the same row positions, so the train/test rows line up by construction
# instead of relying on two separate calls sharing a seed.
(X_orig_train, X_orig_test,
 X_new_train, X_new_test,
 y_train, y_test) = train_test_split(
    X_original_scaled, X_new_scaled, y, test_size=0.2, random_state=42
)


def _fit_and_score(model, X_train, X_test):
    """Fit `model` on the training split and score it on the test split.

    Parameters: `model` is an unfitted estimator; `X_train` / `X_test` are the
    feature splits to fit on and predict for. The targets `y_train` / `y_test`
    are read from the enclosing notebook scope.

    Returns a tuple `(model, accuracy)` where `model` is the same estimator
    after fitting and `accuracy` is the accuracy_score on the test split.
    """
    model.fit(X_train, y_train)
    return model, accuracy_score(y_test, model.predict(X_test))


# --- Train and evaluate models ---
# 1. Logistic Regression (Original Features)
lr_orig, lr_orig_accuracy = _fit_and_score(
    LogisticRegression(random_state=42), X_orig_train, X_orig_test
)

# 2. Logistic Regression (New Features)
lr_new, lr_new_accuracy = _fit_and_score(
    LogisticRegression(random_state=42), X_new_train, X_new_test
)

# 3. Random Forest (Original Features)
rf_orig, rf_orig_accuracy = _fit_and_score(
    RandomForestClassifier(random_state=42), X_orig_train, X_orig_test
)

# 4. Random Forest (New Features)
rf_new, rf_new_accuracy = _fit_and_score(
    RandomForestClassifier(random_state=42), X_new_train, X_new_test
)

print("--- Model Showdown Complete ---")

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b



--- Starting Model Showdown ---
--- Model Showdown Complete ---


In [8]:
# ### Step 6: Display the Results in a Clean Table
print("\n--- Final Results ---")

# Column-wise layout of the four runs: two model families, each scored on
# the original and on the extended feature set
results_data = {
    "Model": ["Logistic Regression"] * 2 + ["Random Forest"] * 2,
    "Features": ["Original", "New & Improved"] * 2,
    "Accuracy": [
        lr_orig_accuracy,
        lr_new_accuracy,
        rf_orig_accuracy,
        rf_new_accuracy,
    ],
}

results_df = pd.DataFrame(results_data)

# Turn each fraction into a percentage string with two decimals
accuracy_pct = results_df['Accuracy'].mul(100)
results_df['Accuracy'] = accuracy_pct.map('{:.2f}%'.format)

print(results_df)


--- Final Results ---
                 Model        Features Accuracy
0  Logistic Regression        Original   79.89%
1  Logistic Regression  New & Improved   78.77%
2        Random Forest        Original   79.33%
3        Random Forest  New & Improved   82.68%
