# Titanic Baseline Experiment

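A simple baseline for the Titanic survival task: a LightGBM classifier trained on basic engineered features and evaluated with 5-fold stratified cross-validation.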

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSVs into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Quick sanity check of what was loaded
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Columns: {train.columns.tolist()}")

In [None]:
# Basic feature engineering
def preprocess_data(df):
    """Return a feature-engineered copy of a passenger DataFrame.

    The input frame is not modified. It must contain the columns ``Name``,
    ``Pclass``, ``Age``, ``Embarked``, ``Fare``, ``Cabin``, ``SibSp`` and
    ``Parch``.

    The returned copy has missing ``Age``, ``Embarked`` and ``Fare`` values
    imputed and gains the columns ``Title``, ``AgeGroup``, ``Deck``,
    ``FamilySize``, ``IsAlone`` and ``FarePerPerson``.
    """
    df = df.copy()

    # Extract the title: the word between a space and the first period.
    # Raw string so that `\.` is a regex escape, not an invalid str escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Collapse rare/foreign titles into a handful of buckets
    title_mapping = {
        'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
        'Dr': 'Other', 'Rev': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Other', 'Ms': 'Miss', 'Lady': 'Other',
        'Jonkheer': 'Other', 'Don': 'Other', 'Dona': 'Other', 'Mme': 'Mrs',
        'Capt': 'Other', 'Sir': 'Other',
    }
    # Titles missing from the mapping (or names without a title) would
    # otherwise become NaN; NaN group keys are excluded by groupby, which
    # would break the Age imputation below for those rows.
    df['Title'] = df['Title'].map(title_mapping).fillna('Other')

    # Fill missing Age with the median of the passenger's (Title, Pclass)
    # group. Only missing values are touched; known ages are never replaced.
    group_median_age = df.groupby(['Title', 'Pclass'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median_age)
    # Groups with no known age at all fall back to the overall median
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Bucket Age. The open-ended last bin and include_lowest keep ages of
    # exactly 0 or above 80 from silently turning into NaN.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 12, 18, 30, 50, float('inf')],
        labels=['Child', 'Teen', 'Young', 'Middle', 'Senior'],
        include_lowest=True,
    )

    # Fill missing Embarked with the most frequent port; mode() is empty
    # when every value is missing, in which case there is nothing to fill with
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode[0])

    # Fill missing Fare with median
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Deck is the first character of the cabin code; missing cabins -> 'Unknown'
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('Unknown')

    # Family size counts the passenger plus siblings/spouses and parents/children
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Fare per person (FamilySize is at least 1, so no division by zero)
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    return df

# Stack train (without the label) on top of test and run the feature
# engineering once over the stacked frame
combined = preprocess_data(
    pd.concat([train.drop(columns='Survived'), test], axis=0, ignore_index=True)
)

# Undo the stacking: the first len(train) rows are train, the rest are test
train_processed = combined.iloc[:len(train)].copy()
train_processed['Survived'] = train['Survived'].values
test_processed = combined.iloc[len(train):].copy()

# Report remaining missing values per column
print("Preprocessing completed")
print(f"Missing values in train:\n{train_processed.isnull().sum()}")
print(f"Missing values in test:\n{test_processed.isnull().sum()}")

In [None]:
# Select features for modeling
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'AgeGroup', 'Deck']
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson', 'IsAlone']

feature_columns = categorical_features + numerical_features

# One-hot encode train and test TOGETHER. Encoding them separately with
# drop_first=True can drop a different baseline category in each split when
# their observed categories differ, which silently mis-encodes test rows once
# the columns are aligned. A joint encoding yields identical columns by
# construction, so no alignment step is needed.
# dtype=float keeps the dummies numeric: recent pandas versions default to
# bool dummies, and a mixed bool/numeric frame converts to an object array.
all_features = pd.get_dummies(
    pd.concat(
        [train_processed[feature_columns], test_processed[feature_columns]],
        axis=0,
    ),
    columns=categorical_features,
    drop_first=True,
    dtype=float,
)

# Split back by position; the train rows come first in the concatenation
train_features = all_features.iloc[:len(train_processed)]
test_features = all_features.iloc[len(train_processed):]

X = train_features.to_numpy(dtype=float)
y = train_processed['Survived'].values
X_test = test_features.to_numpy(dtype=float)

print(f"Feature matrix shape: {X.shape}")
print(f"Test feature matrix shape: {X_test.shape}")
print(f"Number of features: {X.shape[1]}")

In [None]:
# Cross-validation setup
# Single source of truth for the fold count: it drives the splitter, the
# test-prediction averaging and the progress message.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store results
fold_scores = []                           # per-fold validation accuracy
oof_predictions = np.zeros(len(train))     # out-of-fold 0/1 predictions
test_predictions = np.zeros(len(test))     # fold-averaged test probabilities

# Model parameters (conservative baseline)
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

print(f"Starting {n_splits}-fold cross-validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    # Train model.
    # NOTE: the validation fold both selects the stopping iteration and is
    # scored below, so the reported accuracy is somewhat optimistic.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predictions at the early-stopped iteration
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    val_pred_binary = (val_pred > 0.5).astype(int)

    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Store results: hard labels for OOF, averaged probabilities for test
    oof_predictions[val_idx] = val_pred_binary
    test_predictions += test_pred / n_splits

    # Calculate accuracy
    accuracy = accuracy_score(y_val, val_pred_binary)
    fold_scores.append(accuracy)

    print(f"Fold {fold + 1}: Accuracy = {accuracy:.4f}")

# Overall CV score
overall_accuracy = accuracy_score(y, oof_predictions)
print(f"\nOverall CV Accuracy: {overall_accuracy:.4f}")
# \u00b1 is the plus-minus sign, written as an escape so it survives any
# source-file re-encoding
print(f"Mean CV Accuracy: {np.mean(fold_scores):.4f} \u00b1 {np.std(fold_scores):.4f}")

In [None]:
import os

# Create submission file: threshold the fold-averaged probabilities at 0.5
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': (test_predictions > 0.5).astype(int)
})

# Save submission (create the target directory first; to_csv does not)
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print("Submission file created:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"Survival rate in submission: {submission['Survived'].mean():.3f}")

# Also save OOF predictions for analysis. The OOF array is a float buffer
# holding 0/1 labels; cast to int so it matches the submission's format.
oof_df = pd.DataFrame({
    'PassengerId': train['PassengerId'],
    'Survived': y,
    'Survived_Pred': oof_predictions.astype(int)
})
oof_path = '/home/code/experiments/001_baseline/oof_predictions.csv'
os.makedirs(os.path.dirname(oof_path), exist_ok=True)
oof_df.to_csv(oof_path, index=False)

print(f"\nOOF predictions saved. Shape: {oof_df.shape}")