# Titanic Baseline Model - Phase 1

This notebook follows the seed prompt strategy: it establishes baseline performance using simple preprocessing and basic models.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
# Silence every warning category so cell output stays compact.
# NOTE(review): this also hides convergence and deprecation warnings - confirm that is intended.
warnings.simplefilter('ignore')

# One shared seed: applied to NumPy's global RNG here and available to later cells.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("Libraries imported successfully")

Libraries imported successfully


## Load and Explore Data

In [2]:
# Read the train and test CSV files into DataFrames
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Report the dimensions of both frames
for split_label, frame in (("Training", train_df), ("Test", test_df)):
    print(f"{split_label} data shape:", frame.shape)

print("\nTraining data columns:", list(train_df.columns))
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a table
train_df.head()

Training data shape: (891, 12)
Test data shape: (418, 11)

Training data columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First few rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Column dtypes and non-null counts
print("Training data info:")
train_df.info()

# Number of missing entries per column
missing_per_column = train_df.isna().sum()
print("\nMissing values in training data:")
print(missing_per_column)

# Share of each class in the target column
survival_share = train_df['Survived'].value_counts(normalize=True)
print("\nTarget distribution:")
print(survival_share)

## Basic Preprocessing

Following the strategy, this section handles missing values, encodes categorical variables, and creates simple features.

In [None]:
# Fixed integer codes for the normalised titles. These are the codes a
# LabelEncoder assigns (alphabetical order) when exactly these five titles are
# present, but being fixed they are identical for every frame.
_TITLE_CODES = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Rare': 4}

# Spelling variants folded into the common titles.
_TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# Integer codes for the first letter of the cabin; 0 means "no / unknown deck".
_DECK_CODES = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}

# Ports that get an indicator column, whether or not they occur in a frame.
_EMBARKED_PORTS = ('C', 'Q', 'S')

_FEATURE_COLS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone',
                 'Title', 'AgeBand', 'FareBand', 'CabinDeck', 'HasCabin',
                 'Embarked_C', 'Embarked_Q', 'Embarked_S']


def _extract_title(names):
    """Return the normalised title ('Mr', 'Mrs', 'Miss', 'Master' or 'Rare') for each name."""
    titles = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(_TITLE_ALIASES)
    # Anything that is not one of the four common titles (including names
    # with no recognisable title) is grouped as 'Rare'.
    return titles.where(titles.isin(list(_TITLE_CODES)), 'Rare')


def preprocess_data(df, is_train=True, reference_df=None):
    """Turn a raw passenger frame into the numeric feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passengers with the columns Name, Sex, Age, SibSp, Parch, Fare,
        Cabin, Embarked and Pclass. The frame is not modified.
    is_train : bool, default True
        Retained for backward compatibility. Imputation no longer branches on
        it; which statistics are used is decided by ``reference_df``.
    reference_df : pandas.DataFrame, optional
        Frame from which imputation statistics (Age median per title, Fare
        median per Pclass, Embarked mode) and the Fare quartile edges are
        derived. Pass the training frame when transforming the test frame so
        both are processed with the same statistics. Defaults to ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per passenger with the 16 feature columns, in fixed order.

    Notes
    -----
    Changes relative to the earlier version:
    * Title and CabinDeck use fixed mappings instead of a per-call encoder, so
      codes mean the same thing in every frame.
    * The Embarked indicator columns always exist, even if a port is absent.
    * AgeBand / FareBand are computed after imputation, so imputed rows fall
      into a real band rather than the -1 "missing" code.
    * Fare is imputed by Pclass median for every frame.
    """
    ref = df if reference_df is None else reference_df
    out = df.copy()

    # --- Engineered features that need no statistics ---------------------
    out['Title'] = _extract_title(out['Name'])
    out['FamilySize'] = out['SibSp'] + out['Parch'] + 1
    out['IsAlone'] = (out['FamilySize'] == 1).astype(int)
    out['HasCabin'] = out['Cabin'].notna().astype(int)
    # Missing cabins and unrecognised deck letters both map to 0.
    out['CabinDeck'] = out['Cabin'].str[0].map(_DECK_CODES).fillna(0).astype(int)

    # --- Imputation, using statistics from the reference frame -----------
    ref_title = _extract_title(ref['Name'])
    age_by_title = ref['Age'].groupby(ref_title).median()
    # Fallback for titles without any known age: the median of the
    # title-imputed reference ages.
    age_fallback = ref['Age'].fillna(ref_title.map(age_by_title)).median()
    out['Age'] = out['Age'].fillna(out['Title'].map(age_by_title)).fillna(age_fallback)

    port_mode = ref['Embarked'].mode()
    if not port_mode.empty:
        out['Embarked'] = out['Embarked'].fillna(port_mode.iloc[0])

    fare_by_class = ref.groupby('Pclass')['Fare'].median()
    out['Fare'] = (out['Fare']
                   .fillna(out['Pclass'].map(fare_by_class))
                   .fillna(ref['Fare'].median()))

    # --- Bands, computed on the imputed values ---------------------------
    age_band = pd.cut(out['Age'],
                      bins=[0, 12, 18, 35, 60, 100],
                      labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior'],
                      include_lowest=True)
    out['AgeBand'] = age_band.cat.codes  # -1 only for ages outside [0, 100]

    # Quartile edges come from the reference frame; searchsorted with
    # side='left' reproduces right-closed quartile bins (codes 0..3) and also
    # handles fares outside the reference range or duplicate edges.
    fare_edges = ref['Fare'].quantile([0.25, 0.5, 0.75]).to_numpy()
    out['FareBand'] = np.searchsorted(fare_edges, out['Fare'].to_numpy(), side='left')

    # --- Encoding ---------------------------------------------------------
    out['Sex'] = out['Sex'].map({'female': 1, 'male': 0})
    out['Title'] = out['Title'].map(_TITLE_CODES)
    for port in _EMBARKED_PORTS:
        out[f'Embarked_{port}'] = (out['Embarked'] == port).astype(int)

    return out[_FEATURE_COLS]

# Build the model inputs for both splits with the shared preprocessing function
print("Preprocessing training data...")
y_train = train_df['Survived']
X_train = preprocess_data(train_df, is_train=True)

print("Preprocessing test data...")
X_test = preprocess_data(test_df, is_train=False)

# Quick check of the resulting matrices
print("Training features shape:", X_train.shape)
print("Test features shape:", X_test.shape)
print("\nFeature columns:", list(X_train.columns))

## Model Training - Baseline Models

Following the strategy, we start with Logistic Regression and Random Forest as baseline models.

In [None]:
# Shared 5-fold stratified splitter, shuffled with the notebook seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Estimators under comparison
# NOTE(review): the logistic regression sees unscaled features - confirm whether scaling is wanted.
lr_model = LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)

print("=== BASELINE MODELS ===\n")

# 1. Logistic Regression
print("1. Logistic Regression:")
lr_scores = cross_val_score(lr_model, X_train, y_train, cv=skf, scoring='accuracy')
print(f"CV Accuracy: {lr_scores.mean():.4f} ± {lr_scores.std():.4f}")

# 2. Random Forest
print("\n2. Random Forest:")
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=skf, scoring='accuracy')
print(f"CV Accuracy: {rf_scores.mean():.4f} ± {rf_scores.std():.4f}")

# 3. Rule-based reference: predict survival for every female passenger.
# This is scored on the full training set, not through the CV folds.
print("\n3. Gender-only baseline (female=1, male=0):")
gender_pred = train_df['Sex'].eq('female').astype(int)
gender_accuracy = accuracy_score(y_train, gender_pred)
print(f"Accuracy: {gender_accuracy:.4f}")

# Results side by side
print("\n=== SUMMARY ===")
print(f"Gender-only baseline: {gender_accuracy:.4f}")
print(f"Logistic Regression:  {lr_scores.mean():.4f} ± {lr_scores.std():.4f}")
print(f"Random Forest:        {rf_scores.mean():.4f} ± {rf_scores.std():.4f}")

## Train Final Models and Generate Predictions

Train on the full training data and generate predictions for the test set.

In [None]:
# Refit each baseline on every training row, then predict the test set
print("Training final models on full training data...")

# Logistic Regression
lr_final = LogisticRegression(random_state=RANDOM_SEED, max_iter=1000).fit(X_train, y_train)
lr_pred = lr_final.predict(X_test)

# Random Forest
rf_final = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1).fit(X_train, y_train)
rf_pred = rf_final.predict(X_test)

print("Models trained successfully!")

# Rank features by the fitted forest's importance scores, highest first
feature_importance = (
    pd.DataFrame({'feature': X_train.columns,
                  'importance': rf_final.feature_importances_})
    .sort_values('importance', ascending=False)
)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
# Write one submission CSV per model (PassengerId + predicted Survived)
print("Creating submission files...")

passenger_ids = test_df['PassengerId']

# Logistic Regression
lr_submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': lr_pred})
lr_submission.to_csv('/home/submission/submission_lr_baseline.csv', index=False)

# Random Forest
rf_submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': rf_pred})
rf_submission.to_csv('/home/submission/submission_rf_baseline.csv', index=False)

print("Submission files created:")
print("- /home/submission/submission_lr_baseline.csv")
print("- /home/submission/submission_rf_baseline.csv")

# Check the Random Forest submission's shape and first rows
print(f"\nSubmission shape: {rf_submission.shape}")
print("First few rows:")
print(rf_submission.head())

# Class balance of the Random Forest predictions
died_count = (rf_pred == 0).sum()
survived_count = (rf_pred == 1).sum()
print("\nPrediction distribution:")
print(f"Survived=0: {died_count}")
print(f"Survived=1: {survived_count}")

## Summary

This baseline experiment implements:
- Basic preprocessing with missing value imputation
- Key feature engineering (Title, FamilySize, IsAlone, Age/Fare bands, Cabin features)
- Two baseline models: Logistic Regression and Random Forest
- 5-fold stratified cross-validation
- Submission files for both models

Next steps: Tune hyperparameters, try XGBoost, and explore ensembling.