#### Data is here: https://www.kaggle.com/competitions/titanic/data

# Step 1: Load libraries and dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('../titanic/train.csv')
test  = pd.read_csv('../titanic/test.csv')

# Step 2: preprocess & feature-engineer

In [2]:
# fill missing, extract features
for df in (train, test):
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.')
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    
    
# one‐hot encode
cols = ['Pclass','Sex','Embarked','Title','FamilySize']
all_df = pd.get_dummies(pd.concat([train[cols], test[cols]]), drop_first=True)
X = all_df.iloc[:len(train)]
X_test = all_df.iloc[len(train):]
y = train['Survived']

# Step 3: split, train & evaluate

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
lgb = LGBMClassifier(verbose=-1, force_row_wise=True)
lgb.fit(X_train, y_train)
pred = lgb.predict(X_val)
print('Acc:', accuracy_score(y_val, pred), 'AUC:', roc_auc_score(y_val, lgb.predict_proba(X_val)[:,1]))

Acc: 0.7988826815642458 AUC: 0.8864221364221365


# Step 4: hyperparameter tuning for LightGBM

In [6]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, -1],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'subsample': [0.6, 0.8, 1.0]
}
rs_lgb = RandomizedSearchCV(
    LGBMClassifier(verbose=-1, force_row_wise=True),
    param_grid, n_iter=10, cv=3,
    scoring='roc_auc', random_state=42, n_jobs=-1
)
rs_lgb.fit(X, y)
print(rs_lgb.best_params_, rs_lgb.best_score_)

{'subsample': 0.6, 'num_leaves': 31, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1} 0.868133448375036


# Step 5: train final LightGBM & create submission

In [7]:
best_lgb = rs_lgb.best_estimator_
best_lgb.fit(X, y)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': best_lgb.predict(X_test)
})
submission.to_csv('../submission_lgb.csv', index=False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
