In [19]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [20]:
train_file_path = '../Data/train.csv'
test_file_path = '../Data/test.csv'

train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return accuracy_score(y_valid, (preds > 0.5).astype(int))

cols_with_missing_values = [col for col in train_data.columns if train_data[col].isnull().any()]

print(train_data.shape)
train_data[cols_with_missing_values].isnull().sum()

(891, 12)


Age         177
Cabin       687
Embarked      2
dtype: int64

- Remove Cabin Column
- Remove rows with missing values of Embarked
- Impute values for Age

In [21]:
train_data = train_data.drop(columns=['Cabin'], axis=1)
test_data = test_data.drop(columns=['Cabin'], axis=1)

train_data = train_data.dropna(subset=['Embarked'])
test_data = test_data.dropna(subset=['Embarked'])

imputer = SimpleImputer(strategy='median')
train_data['Age'] = imputer.fit_transform(train_data[['Age']])
test_data['Age'] = imputer.transform(test_data[['Age']])

In [22]:
# Categorical variables, and cardinality or the categorical variables
print("Train Data Columns:", train_data.columns)

categorical_cols = [col for col in train_data.columns if train_data[col].dtype == 'object']
print("Categorical Columns:", categorical_cols)

low_cardinality_cols = [col for col in categorical_cols if train_data[col].nunique() < 10]
print("Low Cardinality Columns:", low_cardinality_cols)

numeric_cols = [col for col in train_data.columns if train_data[col].dtype in ['int64', 'float64']]
print("Numeric Columns:", numeric_cols)

all_cols = low_cardinality_cols + numeric_cols
all_cols = set(all_cols) - set(['Survived'])
all_cols = list(all_cols)

print("All Selected Columns:", all_cols)

print("Ticket Carditnality:", train_data['Ticket'].nunique())

Train Data Columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')
Categorical Columns: ['Name', 'Sex', 'Ticket', 'Embarked']
Low Cardinality Columns: ['Sex', 'Embarked']
Numeric Columns: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
All Selected Columns: ['PassengerId', 'Embarked', 'Age', 'Fare', 'Parch', 'SibSp', 'Sex', 'Pclass']
Ticket Carditnality: 680


In [23]:
y = train_data.Survived
X = train_data[all_cols]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [24]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

OE = OrdinalEncoder()
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

label_X_train[low_cardinality_cols] = OE.fit_transform(X_train[low_cardinality_cols])
label_X_valid[low_cardinality_cols] = OE.transform(X_valid[low_cardinality_cols])

OH = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_X_train = X_train.copy()
OH_X_valid = X_valid.copy()

OH_X_train = pd.DataFrame(OH.fit_transform(X_train[low_cardinality_cols]))
OH_X_valid = pd.DataFrame(OH.transform(X_valid[low_cardinality_cols]))

OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index

OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

In [25]:
OE_preds = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
print("Accuracy (Ordinal Encoding):", OE_preds)

OH_preds = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)
print("Accuracy (One-Hot Encoding):", OH_preds)

Accuracy (Ordinal Encoding): 0.7528089887640449
Accuracy (One-Hot Encoding): 0.7191011235955056


## Data Leakage Fix

**Problem Fixed**: The previous approach trained a model on the full dataset and then evaluated it on a subset of that same data, causing data leakage and artificially high accuracy (1.0).

**Solution**: 
1. Use proper train/validation split for model evaluation
2. Train final model on full dataset only for submission (no evaluation)
3. Use separate encoder for final model to avoid contamination

In [26]:
# Train final model on full dataset for submission (no evaluation to avoid leakage)
full_X_train = train_data[all_cols]

# Re-fit encoder on full training data for final model
label_X_full = full_X_train.copy()
OE_full = OrdinalEncoder()
label_X_full[low_cardinality_cols] = OE_full.fit_transform(full_X_train[low_cardinality_cols])

# Train final model on full dataset
final_model = RandomForestRegressor(n_estimators=100, random_state=0)
final_model.fit(label_X_full, y)

print("Final model trained on full dataset - ready for predictions")

Final model trained on full dataset - ready for predictions


In [27]:
# Make predictions on test data using the final model
test_X = test_data[all_cols]

label_X_test = test_X.copy()
label_X_test[low_cardinality_cols] = OE_full.transform(test_X[low_cardinality_cols])

OE_test_preds = final_model.predict(label_X_test)
OE_test_preds = (OE_test_preds > 0.5).astype(int)

submission = pd.DataFrame({
    "PassengerId": test_data.PassengerId,
    "Survived": OE_test_preds
})

submission.to_csv('titanic_OE_submission.csv', index=False)
print("Submission file created: titanic_OE_submission.csv")

Submission file created: titanic_OE_submission.csv
