# Titanic Baseline Model

This notebook creates a baseline model for the Titanic competition using gradient boosting.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Read the competition CSVs (train first, then test) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Report the dimensions of both frames, then the column/dtype summary
# of the training frame.
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print("\nTraining data info:")
train_df.info()

## Basic Preprocessing

In [None]:
# Target column, taken from the training frame.
y = train_df['Survived']

# Feature frames: the identifier is dropped from both, the target from train.
X = train_df.drop(columns=['Survived', 'PassengerId'])
X_test = test_df.drop(columns=['PassengerId'])

# Stack train rows on top of test rows so both receive the same preprocessing.
# Row order is preserved (train first); the later split back into train/test
# is positional and depends on it.
combined = pd.concat([X, X_test])

missing_before = combined.isna().sum()
print("Missing values before preprocessing:")
print(missing_before)

In [None]:
# Fill missing values.
#
# Each filled column is assigned back to `combined` rather than calling
# `combined[col].fillna(..., inplace=True)`. The latter is chained assignment:
# it operates on a temporary Series, which pandas deprecates and which leaves
# `combined` unchanged when Copy-on-Write is enabled.
#
# NOTE: the fill statistics are computed over train and test rows together,
# because `combined` holds both.

# Age: fill with median
combined['Age'] = combined['Age'].fillna(combined['Age'].median())

# Embarked: fill with mode (first value if several are tied)
combined['Embarked'] = combined['Embarked'].fillna(combined['Embarked'].mode()[0])

# Fare: fill with median
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())

# Cabin: create binary feature (1 = a cabin value is present, 0 = missing),
# then drop the raw column.
combined['HasCabin'] = combined['Cabin'].notna().astype(int)
combined.drop('Cabin', axis=1, inplace=True)

print("Missing values after preprocessing:")
print(combined.isnull().sum())

In [None]:
# Extract titles from names: the run of letters that follows a space and is
# terminated by a period. The pattern is a raw string so that `\.` reaches the
# regex engine as written instead of relying on Python passing an unrecognised
# escape sequence through.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Map rare titles to more common ones
title_mapping = {
    'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
    'Dr': 'Other', 'Rev': 'Other', 'Col': 'Other', 'Major': 'Other',
    'Mlle': 'Miss', 'Countess': 'Other', 'Ms': 'Miss', 'Lady': 'Other',
    'Jonkheer': 'Other', 'Don': 'Other', 'Dona': 'Other', 'Mme': 'Mrs',
    'Capt': 'Other', 'Sir': 'Other'
}
# Series.map yields NaN for any title missing from the mapping, and the extract
# above yields NaN for a name with no match. Bucket both cases as 'Other' so no
# missing values are introduced into the Title feature.
combined['Title'] = combined['Title'].map(title_mapping).fillna('Other')

# Drop Name and Ticket (high cardinality)
combined.drop(['Name', 'Ticket'], axis=1, inplace=True)

print("Unique titles:", combined['Title'].unique())
print("Title counts:", combined['Title'].value_counts())

In [None]:
# Columns to convert to integer codes.
categorical_cols = ['Sex', 'Embarked', 'Title', 'Pclass']

# A fresh LabelEncoder is fitted per column; each column is overwritten in
# place with its encoded values.
for col in categorical_cols:
    le = LabelEncoder()
    encoded = le.fit_transform(combined[col])
    combined[col] = encoded

# Summarise the resulting frame: overall shape, then how many columns there
# are of each dtype.
print("Final feature shapes:")
print(combined.shape)
print("\nFeature types:")
print(combined.dtypes.value_counts())

## Create Family Size Feature

In [None]:
# Family size = siblings/spouses + parents/children + the passenger.
combined['FamilySize'] = combined['SibSp'].add(combined['Parch']).add(1)

# Bucket family size into three labelled bins. With pd.cut's default
# right-closed intervals these are (0, 1], (1, 4] and (4, 20].
size_bins = [0, 1, 4, 20]
size_labels = ['Single', 'Small', 'Large']
combined['FamilySizeCategory'] = pd.cut(
    combined['FamilySize'],
    bins=size_bins,
    labels=size_labels,
)

# Replace the bin labels with integer codes.
# NOTE(review): the codes come from LabelEncoder's own class ordering
# (presumably sorted label text), not the Single < Small < Large bin order.
# Confirm that is acceptable for the model.
le_fs = LabelEncoder()
category_codes = le_fs.fit_transform(combined['FamilySizeCategory'])
combined['FamilySizeCategory'] = category_codes

print("Family size distribution:")
print(combined['FamilySizeCategory'].value_counts())

## Prepare Data for Modeling

In [None]:
# Undo the earlier concatenation positionally: the first len(X) rows of
# `combined` are the training rows, everything after them is the test set.
n_train_rows = len(X)
X_processed = combined.iloc[:n_train_rows]
X_test_processed = combined.iloc[n_train_rows:]

print(f"Processed training data shape: {X_processed.shape}")
print(f"Processed test data shape: {X_test_processed.shape}")

# Total count of missing cells in each split (printed for inspection only).
print(f"\nMissing values in training: {X_processed.isnull().sum().sum()}")
print(f"Missing values in test: {X_test_processed.isnull().sum().sum()}")

## Model Training with Cross-Validation

In [None]:
# Set up cross-validation: 5 stratified, shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize model
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# Per-fold validation accuracy, in fold order.
cv_scores = []

print("Training with 5-fold cross-validation...")
# enumerate(..., start=1) supplies the 1-based fold number used in the log line.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_processed, y), start=1):
    # The split yields positional indices, hence .iloc on both features and target.
    X_train, X_val = X_processed.iloc[train_idx], X_processed.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    cv_scores.append(score)

    print(f"Fold {fold}: Accuracy = {score:.4f}")

print(f"\nCross-validation scores: {cv_scores}")
# \u00b1 is the plus-minus sign, written as an escape so the summary line
# prints correctly regardless of how this file's bytes are decoded.
print(f"Mean accuracy: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")

## Train on Full Data and Generate Predictions

In [None]:
# Refit on every training row, then predict labels for the test rows.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred_test = model.fit(X_processed, y).predict(X_test_processed)

# Report how many predictions were made and the count per predicted label.
print(f"Test predictions shape: {y_pred_test.shape}")
print(f"Prediction distribution: {np.bincount(y_pred_test)}")

## Create Submission File

In [None]:
# Create submission dataframe: one row per test passenger, pairing each
# identifier with its predicted label.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred_test
})

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission (index omitted so the file contains only the two columns)
submission.to_csv('/home/submission/submission.csv', index=False)
print("\nSubmission saved to /home/submission/submission.csv")

# Print the facts needed to check the submission format by eye.
print("\nVerifying submission format...")
print(f"Columns: {list(submission.columns)}")
print(f"Number of rows: {len(submission)}")
# The expected row count comes from the loaded test set rather than a
# hard-coded number, so it stays correct if the input file changes.
print(f"Expected rows: {len(test_df)}")
print(f"PassengerId range: {submission['PassengerId'].min()} to {submission['PassengerId'].max()}")