# Titanic Baseline: LightGBM

First baseline experiment using LightGBM with basic feature engineering.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import os

# Make sure the folder that will receive experiment artefacts is present
# (no error if it already exists).
os.makedirs('/home/code/experiments', exist_ok=True)

# Read both competition CSVs into DataFrames.
test_df = pd.read_csv('/home/data/test.csv')
train_df = pd.read_csv('/home/data/train.csv')

# Quick sanity check: dimensions first, then the column names of each split.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
train_columns = train_df.columns.tolist()
test_columns = test_df.columns.tolist()
print("\nTrain columns:", train_columns)
print("\nTest columns:", test_columns)

In [None]:
# Basic feature engineering
def engineer_features(df):
    """Return a copy of ``df`` with imputed values and derived features.

    The input frame is left untouched. It must provide the columns ``Age``,
    ``Fare``, ``Embarked``, ``Name``, ``SibSp``, ``Parch`` and ``Cabin``.

    Imputation:
        ``Age`` and ``Fare`` gaps are filled with the median of *this* frame
        (so frames passed separately are imputed independently);
        ``Embarked`` gaps are filled with ``'S'``.

    Added columns:
        Title: honorific parsed from ``Name`` and collapsed to one of
            ``Mr``/``Mrs``/``Miss``/``Master``/``Other``. Unlisted or
            unparseable honorifics become ``Other``.
        FamilySize: ``SibSp + Parch + 1``.
        IsAlone: 1 when ``FamilySize`` equals 1, otherwise 0.
        AgeGroup: categorical band ``Child``/``Teen``/``YoungAdult``/
            ``Adult``/``Senior`` over the edges 0, 12, 18, 35, 60, 100.
        FarePerPerson: ``Fare`` divided by ``FamilySize``.
        HasCabin: 1 when ``Cabin`` is present, otherwise 0.
        CabinLetter: first character of ``Cabin``, or ``'Unknown'``.

    Args:
        df: pandas DataFrame with the columns listed above.

    Returns:
        A new DataFrame containing the original columns (with imputed
        values) plus the derived columns.
    """
    df = df.copy()

    # Fill missing values
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Embarked'] = df['Embarked'].fillna('S')

    # Extract titles from names: the word directly followed by a period.
    # Raw string so that `\.` is a regex escape, not a Python string escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Simplify titles
    title_mapping = {
        'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
        'Dr': 'Other', 'Rev': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Other', 'Ms': 'Miss', 'Lady': 'Other',
        'Jonkheer': 'Other', 'Don': 'Other', 'Dona': 'Other', 'Mme': 'Mrs',
        'Capt': 'Other', 'Sir': 'Other',
    }
    # `.map` yields NaN for anything not in the mapping; fold those into
    # 'Other' so the column stays purely string-valued for later encoding.
    df['Title'] = df['Title'].map(title_mapping).fillna('Other')

    # Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Age groups. `include_lowest=True` closes the first bin on the left so
    # an age of exactly 0 lands in 'Child' instead of becoming NaN.
    # Ages above 100 still fall outside every bin and yield NaN.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 12, 18, 35, 60, 100],
        labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior'],
        include_lowest=True,
    )

    # Fare per person (FamilySize is always >= 1, so no division by zero)
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Cabin features
    df['HasCabin'] = df['Cabin'].notna().astype(int)
    df['CabinLetter'] = df['Cabin'].str[0].fillna('Unknown')

    return df

# Run the same feature pipeline over each split (train first, then test).
# Each call works on its own copy, so the raw frames stay unchanged.
train_processed, test_processed = (
    engineer_features(raw_frame) for raw_frame in (train_df, test_df)
)

print("Features engineered successfully")

# Show the first rows of a few derived columns as a visual check.
print("\nSample of new features:")
preview = train_processed[['Title', 'FamilySize', 'IsAlone', 'AgeGroup', 'FarePerPerson', 'HasCabin']]
print(preview.head())

In [None]:
# Prepare data for modeling
# Columns that get label-encoded in the next step.
categorical_features = [
    'Pclass', 'Sex', 'Embarked', 'Title', 'AgeGroup', 'CabinLetter',
]
# Columns that are fed to the model as-is.
numerical_features = [
    'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson', 'HasCabin',
]

# Full feature list: categorical columns first, numerical columns after.
feature_columns = [*categorical_features, *numerical_features]

# Target comes from the training split only; both splits share the same
# feature columns in the same order.
y = train_processed['Survived']
X = train_processed[feature_columns]
X_test = test_processed[feature_columns]

print(f"Training features shape: {X.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"\nFeatures: {feature_columns}")

In [None]:
# Encode categorical features as integer codes
from sklearn.preprocessing import LabelEncoder

# Work on copies so X / X_test keep their original (string/categorical) values.
X_encoded, X_test_encoded = X.copy(), X_test.copy()

# One fitted encoder per categorical column, kept for later inspection.
label_encoders = {}
for column in categorical_features:
    # Learn the label vocabulary from train and test together, so a value
    # that occurs in only one split still has an integer code.
    both_splits = pd.concat([X[column], X_test[column]], axis=0)
    encoder = LabelEncoder().fit(both_splits)

    X_encoded[column] = encoder.transform(X[column])
    X_test_encoded[column] = encoder.transform(X_test[column])

    label_encoders[column] = encoder

print("Categorical features encoded")
print("\nSample of encoded data:")
print(X_encoded.head())

In [None]:
# Cross-validation setup: stratified folds keep the class balance of `y`
# in every split; the fixed random_state makes the split reproducible.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# LightGBM parameters. They are identical for every fold, so the dict is
# built once here instead of being re-created inside the loop.
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42,
}

# Store results
cv_scores = []                              # accuracy of each fold
oof_predictions = np.zeros(len(X))          # out-of-fold 0/1 labels per training row
test_predictions = np.zeros(len(X_test))    # fold-averaged test probabilities

print(f"Starting {n_splits}-fold cross-validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_encoded, y), start=1):
    print(f"\nFold {fold}/{n_splits}")

    X_train, X_val = X_encoded.iloc[train_idx], X_encoded.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Create LightGBM datasets; the validation set references the training
    # set so both are binned consistently.
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Train model: up to 1000 rounds, stopping once the validation metric
    # has not improved for 50 rounds; per-iteration logging is disabled.
    # NOTE: the same fold drives early stopping and is scored below, so the
    # reported fold accuracy can be slightly optimistic.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    # Predictions (probabilities) at the best iteration found by early stopping
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    val_pred_binary = (val_pred > 0.5).astype(int)

    test_pred = model.predict(X_test_encoded, num_iteration=model.best_iteration)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, val_pred_binary)
    cv_scores.append(accuracy)

    # Store OOF predictions (hard labels) at the validation row positions
    oof_predictions[val_idx] = val_pred_binary

    # Accumulate test predictions: each fold contributes 1/n_splits of the mean
    test_predictions += test_pred / n_splits

    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

# Overall CV score
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f"\n{'='*50}")
print("Cross-Validation Results:")
# The separator is a real plus-minus sign; the previous literal was a
# mis-decoded (mojibake) version of it.
print(f"Mean Accuracy: {mean_cv_score:.4f} ± {std_cv_score:.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")
print(f"{'='*50}")

# OOF accuracy: accuracy over all training rows, each predicted by the model
# of the fold in which it was held out.
oof_accuracy = accuracy_score(y, oof_predictions)
print(f"OOF Accuracy: {oof_accuracy:.4f}")

In [None]:
# Create submission: threshold the fold-averaged test probabilities at 0.5
# to obtain hard 0/1 labels, keyed by the PassengerId of the raw test frame.
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': (test_predictions > 0.5).astype(int)
})

# Save submission
submission_path = '/home/submission/submission.csv'
# `to_csv` does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"\nSubmission shape: {submission_df.shape}")
print("\nFirst 10 rows of submission:")
print(submission_df.head(10))

# Verify submission format: expected columns and a binary label column
print(f"\nSubmission columns: {submission_df.columns.tolist()}")
print(f"Unique values in Survived: {submission_df['Survived'].unique()}")
print(f"Value counts:\n{submission_df['Survived'].value_counts()}")

In [None]:
# Feature importance
import json

# Gain-based importances read from `model`, i.e. the booster left behind by
# the most recent training run; names come from the encoded training frame.
feature_names = X_encoded.columns.tolist()
feature_importance = model.feature_importance(importance_type='gain')

# Pair each feature with its importance, then rank from most to least important.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(importance_df.head(10))

# Save results: per-fold and aggregate CV metrics plus the ranked importances
# (one {'feature', 'importance'} record per feature).
importance_records = importance_df.to_dict('records')
results = {
    'cv_scores': cv_scores,
    'mean_cv_score': mean_cv_score,
    'std_cv_score': std_cv_score,
    'oof_accuracy': oof_accuracy,
    'feature_importance': importance_records
}

with open('/home/code/experiments/001_baseline_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"\nResults saved to: /home/code/experiments/001_baseline_results.json")