In [29]:
import pandas as pd
import numpy as np

In [30]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [32]:
import re

def preprocess_data(dataset):
    # Remove quotes from the 'name' column
    dataset['Name'] = dataset['Name'].apply(lambda x: re.sub(r'"', '', x))

    # Extract and apply name prefixes
    def extract_name_prefix(name):
        match = re.search(r'^(?:.*,)?(?:\s*)?(.*?\.)', name)
        return match.group(1) if match else ''
    dataset['Name_prefix'] = dataset['Name'].apply(extract_name_prefix)

    # Extract ticket type and encode it
    dataset['Ticket_type'] = dataset['Ticket'].apply(lambda x: x[0:3])
    dataset['Ticket_type'] = dataset['Ticket_type'].astype('category').cat.codes

    # Standardize titles in the 'title' column
    title_replacements = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    dataset['Name_prefix'] = dataset['Name_prefix'].replace(title_replacements)

    # Handling the 'Age' column by filling missing values with random ages based on the mean and std of the dataset
    mean = dataset['Age'].mean()
    std = dataset['Age'].std()
    is_null = dataset['Age'].isnull().sum()
    rand_age = np.random.randint(max(0, mean - std), mean + std, size=is_null)  # Ensure lower bound is non-negative
    age_slice = dataset['Age'].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset['Age'] = age_slice
    dataset['Age'] = dataset['Age'].astype(int)

    # Fill missing 'Embarked' values with the mode
    embarked_mode = dataset['Embarked'].mode()[0]  # Safely extract the mode
    dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode)

    # Calculate relatives and define 'travelled_alone'
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset['travelled_alone'] = 'No'
    dataset.loc[dataset['relatives'] == 0, 'travelled_alone'] = 'Yes'
    
    return dataset

In [33]:
df_pre = preprocess_data(df)

In [34]:
df_pre.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_prefix,Ticket_type,relatives,travelled_alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,Mr.,124,1,No
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,Mrs.,137,1,No
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,Miss.,148,0,Yes
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,Mrs.,3,1,No
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,Mr.,97,0,Yes
5,6,0,3,"Moran, Mr. James",male,33,0,0,330877,8.4583,,Q,Mr.,72,0,Yes
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S,Mr.,18,0,Yes
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S,Master.,83,4,No
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S,Mrs.,81,2,No
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C,Mrs.,38,1,No


In [42]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import optuna
import sklearn.metrics

In [36]:
df_pre.columns.unique()

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Name_prefix',
       'Ticket_type', 'relatives', 'travelled_alone'],
      dtype='object')

In [64]:
def objective(trial):
    # Assume df_pre is a predefined DataFrame you've prepared elsewhere
    features = ["Pclass", "Sex", "SibSp", "Parch"]
    X = pd.get_dummies(df_pre[features])
    y = df_pre["Survived"]

    # Split the dataset into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Convert to DMatrix object, which is a requirement for xgboost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    
    # Parameter space definition
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "error",
        "tree_method": "hist",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "n_estimators": 100,  # Consider setting it to a higher value
        "early_stopping_rounds": 30,
        "verbose" : 1,
    }

    # Model training
    bst = xgb.train(param, dtrain)
    
    # Model prediction
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    
    # Calculate accuracy
    accuracy = sklearn.metrics.accuracy_score(y_test, pred_labels)
    return accuracy

In [65]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=600)

[I 2024-07-18 03:16:16,699] A new study created in memory with name: no-name-cc9c3625-e8fe-425f-993d-0bb114bc7835
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
[I 2024-07-18 03:16:16,730] Trial 0 finished with value: 0.7877094972067039 and parameters: {'booster': 'gbtree', 'lambda': 0.0003867697338185661, 'alpha': 0.005336550225164682, 'subsample': 0.21885674792438028, 'colsample_bytree': 0.5432305559883553}. Best is trial 0 with value: 0.7877094972067039.
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
[I 2024-07-18 03:16:16,751] Trial 1 finished with value: 0.7877094972067039 and parameters: {'booster': 'gblinear', 'lambda': 0.003069450316367369, 'alpha': 2.4978290305913375e-07, 'subsample': 0.9913889468841937, 'colsample_bytree': 0.6639846918687005}. Best is trial 0 with value: 0.7877094972067039.
  "lambda": trial.suggest_loguniform("lambda", 1e-

In [66]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  100
Best trial:
  Value: 0.7988826815642458
  Params: 
    booster: gbtree
    lambda: 0.00322384564221531
    alpha: 0.01333623659658508
    subsample: 0.7916822270700862
    colsample_bytree: 0.5543856106987577


In [67]:
best_params = study.best_trial.params

In [68]:
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(df_pre[features])
y = df_pre["Survived"]

# Split the dataset into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

In [69]:
final_model = xgb.train(best_params, dtrain)

In [70]:
final_preds = final_model.predict(dvalid)
final_pred_labels = np.rint(final_preds)
accuracy = sklearn.metrics.accuracy_score(y_test, final_pred_labels)
print("Final Model Accuracy: {}".format(accuracy))

Final Model Accuracy: 0.7988826815642458


In [71]:
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")

df_test_pre = preprocess_data(df_test )

In [72]:
features = ["Pclass", "Sex", "SibSp", "Parch"]
X_test_pre = pd.get_dummies(df_test_pre[features])

In [73]:
dtest = xgb.DMatrix(X_test_pre)
final_preds = final_model.predict(dtest)
final_pred_labels = np.rint(final_preds).astype(int)

In [74]:
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": final_pred_labels
})

# Save the submission DataFrame to a CSV file
submission.to_csv("submission.csv", index=False)