In [214]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import xgboost as xgb


In [176]:
# Read the raw training and test tables from the local Data/ directory
train, test = map(pd.read_csv, ("Data/train.csv", "Data/test.csv"))

In [186]:
def extract_title_and_family_name(df, name_column='Name'):
    """Derive title and surname-based features from a name column.

    Names are parsed as "Surname, Title. Given names": the surname is the text
    before the first comma and the title is the text between that comma and
    the next period.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``name_column``. It is not modified.
    name_column : str, default 'Name'
        Column holding the full name strings.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``name_column`` and with these columns appended:
        ``Title``, ``Family Name group size`` (number of rows in ``df`` that
        share the surname) and one 0/1 indicator column per title group.
        A name without a comma gets a missing ``Title`` and 0 in every
        indicator instead of raising.
    """
    name_parts = df[name_column].str.split(',')

    # Surname: everything before the first comma
    family_name = name_parts.str[0].str.strip()
    # Title: between the first comma and the following period, e.g. "Mr", "Mrs", "Miss"
    title = name_parts.str[1].str.split('.').str[0].str.strip()

    title_groups = {
        'Royal_Male_Title': ['Don', 'Sir', 'Jonkheer'],
        'Royal_Female_Title': ['Dona', 'the Countess', 'Lady'],
        'Crew_Title': ['Capt', 'Col', 'Major'],
        'Professional_Title': ['Dr', 'Rev'],
        'Master_Title': ['Master'],
        'Miss_Title': ['Miss', 'Mlle', 'Ms'],
        'Mr_Title': ['Mr'],
        # "Mme" (Madame) is the French form of "Mrs", so it belongs with the married titles
        'Mrs_Title': ['Mrs', 'Mme'],
    }

    features = {
        'Title': title,
        'Family Name group size': family_name.map(family_name.value_counts()),
    }
    for column, titles in title_groups.items():
        features[column] = title.isin(titles).astype(int)

    # drop() returns a new frame, so the caller's DataFrame is left untouched
    return df.drop(columns=name_column).assign(**features)


In [155]:
def extract_deck(df, deck_column = 'Cabin'):
    """Replace the cabin column with a ``Deck`` column holding its first character.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``deck_column``. It is not modified.
    deck_column : str, default 'Cabin'
        Column holding the cabin identifiers.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``deck_column`` and with ``Deck`` appended.
        Missing or empty cabin values become ``'Unknown'``.
    """
    def first_character(cabin):
        # Missing (NaN/None) and empty cabin values carry no deck information
        if pd.isnull(cabin) or cabin == '':
            return 'Unknown'
        return cabin[0]

    deck = df[deck_column].map(first_character)

    # drop() returns a new frame, so the caller's DataFrame is left untouched
    return df.drop(columns=deck_column).assign(Deck=deck)

In [156]:
def extract_family_size_and_alone(df):
    """Add family-size features computed from the ``SibSp`` and ``Parch`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing numeric ``SibSp`` and ``Parch`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with two columns appended: ``Family Size``
        (``SibSp + Parch + 1``, the +1 counting the passenger) and ``Alone``
        (1 when ``Family Size`` equals 1, otherwise 0).
    """
    family_size = df['SibSp'] + df['Parch'] + 1

    # assign() builds a new frame, so re-running a cell never double-applies the step
    return df.assign(**{
        'Family Size': family_size,
        'Alone': family_size.eq(1).astype(int),
    })

In [157]:
def ticket_group_size(df, ticket_column = 'Ticket'):
    """Replace the ticket column with the number of rows sharing that ticket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``ticket_column``. It is not modified.
    ticket_column : str, default 'Ticket'
        Column holding the ticket identifiers.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``ticket_column`` and with an integer
        ``Ticket Group Size`` column appended. Counts are taken within ``df``
        only. A missing ticket is treated as a group of one.
    """
    tickets = df[ticket_column]

    # value_counts() skips missing tickets, so those rows map to NaN and are
    # filled with 1 instead of raising a KeyError
    group_sizes = tickets.map(tickets.value_counts()).fillna(1).astype(int)

    # drop() returns a new frame, so the caller's DataFrame is left untouched
    return df.drop(columns=ticket_column).assign(**{'Ticket Group Size': group_sizes})

In [196]:
def handle_age(df):
    """Flag and impute missing ages, then drop the ``Title`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age`` and ``Title`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame where missing ``Age`` values are replaced by the median
        age of ``df``, an ``Age_missing`` 0/1 indicator is appended, and the
        ``Title`` column is removed.

    Raises
    ------
    KeyError
        If ``Age`` or ``Title`` is not a column of ``df``.
    """
    age = df['Age']

    # Assign the filled column explicitly: `df['Age'].fillna(..., inplace=True)`
    # acts on a temporary Series and does not update the frame under copy-on-write
    imputed = df.assign(
        Age_missing=age.isna().astype(int),
        Age=age.fillna(age.median()),
    )

    return imputed.drop(columns='Title')

In [209]:
def preprocessing(train, test, target='Survived'):
    """Run the full feature pipeline on the raw train and test tables.

    Each table is indexed by ``PassengerId``, passed through the title, deck,
    family-size, ticket-group and age steps, and one-hot encoded with
    ``pd.get_dummies``. The test columns are then aligned to the train columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw tables containing a ``PassengerId`` column plus the columns the
        individual feature steps read. Neither frame is re-indexed in place.
    target : str, default 'Survived'
        Label column. It is kept in the test output only if the test table
        already contains it.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed train and test frames. The test frame has the same
        columns in the same order as the train frame (minus ``target`` when
        the test table has no labels). Dummy columns seen only in train are
        added to test filled with 0, and dummy columns seen only in test are
        dropped.
    """
    def _build_features(raw):
        # set_index without inplace keeps the caller's frame intact, so this
        # function can be called again on the same raw tables
        return (
            raw.set_index('PassengerId')
            .pipe(extract_title_and_family_name)
            .pipe(extract_deck)
            .pipe(extract_family_size_and_alone)
            .pipe(ticket_group_size)
            .pipe(handle_age)
            .pipe(pd.get_dummies)
        )

    train = _build_features(train)
    test = _build_features(test)

    # Align test to train instead of patching a single known column by hand:
    # a model fitted on train needs the same columns in the same order
    aligned_columns = [
        column for column in train.columns
        if column != target or column in test.columns
    ]
    test = test.reindex(columns=aligned_columns, fill_value=0)

    return train, test

In [210]:
# Reload the raw CSVs and run the full feature pipeline on both tables
train, test = (pd.read_csv(csv_path) for csv_path in ("Data/train.csv", "Data/test.csv"))
train_df, test_df = preprocessing(train, test)

In [211]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible
train, val = train_test_split(train_df, test_size=0.2, random_state=42)

X_train = train.drop(columns='Survived')
y_train = train['Survived']
X_val = val.drop(columns='Survived')
y_val = val['Survived']

# The test table may not include the Survived label, so only split it off when present.
# Reindexing guarantees X_test has exactly the training columns, in the training order.
X_test = (
    test_df.drop(columns='Survived', errors='ignore')
    .reindex(columns=X_train.columns, fill_value=0)
)
y_test = test_df['Survived'] if 'Survived' in test_df.columns else None

In [212]:
# Inspect the feature columns the model will be trained on
X_train.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Family Name group size',
       'Royal_Male_Title', 'Royal_Female_Title', 'Crew_Title',
       'Professional_Title', 'Master_Title', 'Miss_Title', 'Mr_Title',
       'Mrs_Title', 'Family Size', 'Alone', 'Ticket Group Size', 'Age_missing',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
       'Deck_T', 'Deck_Unknown'],
      dtype='object')

In [240]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import numpy as np

# Candidate values for each hyperparameter; RandomizedSearchCV samples combinations from these
param_dist = {
    'n_estimators': np.arange(100, 501, 10),  # 100 to 500 in steps of 10
    'max_depth': np.arange(3, 11, 1),  # depths from 3 to 10
    'learning_rate': np.linspace(0.01, 0.3, 5),  # 5 evenly spaced values between 0.01 and 0.3
    'subsample': np.linspace(0.6, 1.0, 3),  # 3 values: 0.6, 0.8, 1.0
    'colsample_bytree': np.linspace(0.6, 1.0, 3),  # 3 values: 0.6, 0.8, 1.0
    'gamma': np.linspace(0, 0.5, 3),  # 3 values: 0, 0.25, 0.5
    'alpha': np.logspace(-2, 2, 3),  # 3 values: 0.01, 1, 100 (L1 regularization)
    'reg_lambda': np.logspace(-2, 2, 3),  # 3 values: 0.01, 1, 100 (L2 regularization)
    'min_child_weight': np.arange(1, 6, 1),  # values from 1 to 5
    'scale_pos_weight': np.linspace(1, 5, 3),  # 3 values: 1, 3, 5 for handling class imbalance
    'colsample_bylevel': np.linspace(0.6, 1.0, 3),  # 3 values: 0.6, 0.8, 1.0 (per-level feature sampling)
    'colsample_bynode': np.linspace(0.6, 1.0, 3),  # 3 values: 0.6, 0.8, 1.0 (per-node feature sampling)
    'max_delta_step': np.arange(0, 6, 2),  # 3 values: 0, 2, 4
}

# Initialize the XGBoost classifier
# NOTE(review): use_label_encoder is reportedly deprecated in newer XGBoost releases — confirm against the installed version
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Set up the randomized search: 2000 sampled candidates x 5 folds = 10000 model fits
random_search = RandomizedSearchCV(
    xgb_clf, 
    param_distributions=param_dist, 
    n_iter=2000,  # number of parameter combinations sampled (long-running)
    scoring='accuracy', 
    n_jobs=-1, 
    cv=5,  # 5-fold cross-validation
    verbose=2, 
    random_state=42
)

# Run the search on the training split only (the validation split stays held out)
random_search.fit(X_train, y_train)

# Best parameters and mean cross-validated accuracy of the best candidate
print("Best Parameters:", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)


Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Best Parameters: {'subsample': 0.8, 'scale_pos_weight': 1.0, 'reg_lambda': 1.0, 'n_estimators': 150, 'min_child_weight': 1, 'max_depth': 4, 'max_delta_step': 4, 'learning_rate': 0.08249999999999999, 'gamma': 0.0, 'colsample_bytree': 0.6, 'colsample_bynode': 0.6, 'colsample_bylevel': 1.0, 'alpha': 0.01}
Best CV Score: 0.841327686398109


In [239]:
# Initialize the XGBoost classifier
# Refit XGBoost with the best hyperparameters printed by the randomized search
# (values copied by hand; learning_rate is rounded to 0.0825).
# eval_metric is 'logloss', the binary metric also used during the search;
# 'mlogloss' is the multiclass variant and does not fit this two-class target.
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    subsample=0.8,
    scale_pos_weight=1.0,
    reg_lambda=1,
    n_estimators=150,
    min_child_weight=1,
    max_depth=4,
    max_delta_step=4,
    learning_rate=0.0825,
    gamma=0.0,
    colsample_bytree=0.6,
    colsample_bynode=0.6,
    colsample_bylevel=1,
    alpha=0.01,
)

# Fit on the training split and report precision/recall on the held-out validation split
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_val)
report = classification_report(y_val, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       105
           1       0.81      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.83      0.83      0.83       179



In [None]:
# Predict labels for the test features with the fitted classifier
y_test_pred = xgb_clf.predict(X_test)