<h1 style="background-color:purple;
           font-family:segoe ui;
           color:black;
           font-size:250%;
           text-align:center;
           border-radius:10px 10px;">
         ☄️ Spaceship Titanic   🚀
</h1>
    
<h1 style="background-color:black;
           font-family:segoe ui;
           color:white;
           font-size:150%;
           text-align:center;">         
          🌑 Which passengers are transported to an alternate dimension? 🌑
</h1>

 <div>
<div style="text-align: center;"> <img src="https://github.com/OfirMazor/Kaggle/blob/main/Spaceship%20Titanic/img/DALLE2%20-%20Spaceship%20Titanic2.png?raw=true" width="400" alt="DALLE2: 'Spaceship Titanic'"/>
</div>
</div>

* 
<h1 style="text-align:Left; font-size:150%; color:#1c6ce6;">
Libraries 📚
</h1>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score


import xgboost as xgb
import lightgbm as lgbm

* 
<h1 style="text-align:Left; font-size:150%; color:#1c6ce6;">
Load data 🗐
</h1>

In [None]:
# Read the competition's train / test CSV files from the Kaggle input directory.
train_data, test_data = map(
    pd.read_csv,
    ('/kaggle/input/spaceship-titanic/train.csv',
     '/kaggle/input/spaceship-titanic/test.csv'),
)

In [None]:
# Peek at three random raw rows (unseeded, so the rows differ between runs).
train_data.sample(n=3)

* 
<h1 style="text-align:Left; font-size:150%; color:#1c6ce6;">
Feature engineering and data cleaning ⚙️
</h1>

In [None]:
def features_engine(df           : pd.DataFrame,
                    train        : bool,
                    Cabin_column : str,
                    ID_Column    : str,
                    Name_Column  : str,
                    bools        : list,
                    categoricals : list,
                    money_cols   : list,
                    age_col      : str,
                    NaNs         : str):
    '''
    Full feature-engineering and cleaning pipeline for one DataFrame.

    The DataFrame is modified IN PLACE (columns are added and dropped on the
    caller's object) and the same object is returned, so the function is not
    idempotent: it expects the raw columns and cannot be re-run on its own output.

    Parameters:
        - df           : Raw DataFrame to process,
        - train        : True if df is the training set (only affects NaNs == 'drop'),
        - Cabin_column : Name of the 'deck/number/side' text column,
        - ID_Column    : Name of the 'group_number' passenger-id column,
        - Name_Column  : Name of the 'first last' full-name column,
        - bools        : Boolean columns to convert to nullable integers (0/1),
        - categoricals : Categorical columns to ordinal-encode (the list is not modified),
        - money_cols   : Expense columns,
        - age_col      : Name of the age column,
        - NaNs         : 'drop' (drop incomplete rows, training data only) or
                         'fill' (impute missing values).

    Returns:
        The processed input DataFrame.

    NOTE(review): the ordinal encoders are fitted separately on every call, so
    train and test codes only agree if both frames contain the same category
    sets after cleaning - worth confirming.
    '''

    num_rows = len(df)
    #########################################################################################################################

    def Bool2int(df, bools):
        '''
        Converts Boolean columns to nullable integers (0, 1, <NA>).
        Going through the nullable 'boolean' dtype keeps missing values missing;
        a plain astype(bool) would silently turn every NaN into True.
        '''
        for column in bools:
            df[column] = df[column].astype('boolean').astype('Int32')

    #########################################################################################################################

    def cabin_engine(df, Cabin_column):
        '''
        Splits the Cabin description text ('deck/number/side') into 3 features.
        Missing cabins stay NaN in all three columns so that NaNsProcess can
        drop or impute them. The original Cabin_column is dropped.
        '''
        # reindex guarantees three columns even if no row contains a '/'
        parts = (df[Cabin_column]
                 .str.split('/', n=2, expand=True)
                 .reindex(columns=range(3)))

        df['CabinDeck']   = parts[0]
        df['CabinNumber'] = pd.to_numeric(parts[1], errors='coerce')
        df['CabinSide']   = parts[2]

        df.drop(columns = Cabin_column, inplace=True)

    #########################################################################################################################

    def name_engine(df, Name_Column):
        '''
        Splits the full name text into 2 features (FirstName, LastName).
        Adds the length of each name (FirstName_len, LastName_len).
        Adds FamilySize (number of passengers sharing the LastName; 0 when unknown).
        Adds IsFamily (0: solo passenger, 1: passenger is part of a family).
        The original Name_Column is dropped.
        '''
        # n=1 keeps any extra tokens in LastName instead of failing on 3-word names
        names = (df[Name_Column]
                 .str.split(' ', n=1, expand=True)
                 .reindex(columns=range(2))
                 .astype(object))
        df['FirstName'] = names[0]
        df['LastName']  = names[1]
        df.drop(columns = Name_Column, inplace = True)

        # Name lengths (NaN for missing names, imputed later by NaNsProcess)
        df['FirstName_len'] = df['FirstName'].str.len()
        df['LastName_len']  = df['LastName'].str.len()

        # Family size is computed before the placeholder names are filled in,
        # so passengers without a last name get 0 instead of forming one big family.
        family_counts    = df['LastName'].value_counts(dropna=True)
        df['FamilySize'] = df['LastName'].map(family_counts).fillna(0).astype(int)

        # Temporary name "No Name" for unnamed passengers
        df['FirstName'] = df['FirstName'].fillna('No')
        df['LastName']  = df['LastName'].fillna('Name')

        # Empty-string names (length 0) get the mean length instead
        for len_col in ('FirstName_len', 'LastName_len'):
            df[len_col] = df[len_col].replace(0, df[len_col].mean())

        df['IsFamily'] = (df['FamilySize'] > 1).astype(int)

    #########################################################################################################################

    def group_engine(df, ID_Column):
        '''
        Adds 4 columns (IDGroup, NumberInGroup, GroupSize, IsGroup) from the input ID_Column.
        '''
        id_parts = df[ID_Column].str.split('_', expand=True)
        df['IDGroup']       = id_parts[0].astype(int)
        df['NumberInGroup'] = id_parts[1].astype(int)

        # map() aligns on values, so it works for any index and any pandas version
        df['GroupSize'] = df['IDGroup'].map(df['IDGroup'].value_counts())
        df['IsGroup']   = (df['GroupSize'] > 1).astype(int)

    #########################################################################################################################

    def age_engine(df, age_col):
        '''
        Fills missing ages with the mean age and adds a column ('AgeGroup')
        that clusters the ages into 4 groups:
        Children (<= 10), Youth (10-20], Adults (20-40], Elders (> 40).
        '''
        df[age_col] = df[age_col].fillna(df[age_col].mean())

        # Contiguous right-closed bins: no age (e.g. a fractional mean) falls in a gap
        age_bins = [-np.inf, 10, 20, 40, np.inf]
        groups   = ['Children', 'Youth', 'Adults', 'Elders']

        df['AgeGroup'] = pd.cut(df[age_col], bins=age_bins, labels=groups).astype(object)

    #########################################################################################################################

    def payment_engine(df, money_cols):
        '''
        Adds TotalBill (sum of all expenses), AverageBill (mean of the known expenses),
        IsBill (1 if the passenger spent anything) and
        CountBill (number of non-zero expenses) for each passenger.
        '''
        spend = df[money_cols]

        df['TotalBill']   = spend.sum(axis = 1)
        df['AverageBill'] = spend.mean(axis = 1)
        df['IsBill']      = (df['TotalBill'] > 0).astype(int)
        # Missing amounts count as "no expense", exactly like zeros
        df['CountBill']   = spend.fillna(0).ne(0).sum(axis = 1)

    #########################################################################################################################

    def encode(df, categoricals):
        '''
        Performs ordinal encoding of the categorical columns (plus AgeGroup when present).
        Works on a copy of the list so the caller's `categoricals` is left untouched.
        '''
        to_encode = list(categoricals)
        if 'AgeGroup' in df.columns and 'AgeGroup' not in to_encode:
            to_encode.append('AgeGroup')

        for category in to_encode:
            df[category] = OrdinalEncoder().fit_transform(df[[category]]).ravel()

    #########################################################################################################################

    def Check_DuplicatedIDs(df, ID_column):
        '''
        Reports whether several rows share the same passenger ID.
        '''
        len_duplicated = int(df.duplicated(ID_column, keep=False).sum())
        if len_duplicated == 0:
            print("No duplicated ID's were found")
        else:
            print(f"There are {len_duplicated} duplicated ID's")

    #########################################################################################################################

    def NaNsProcess(df, train, NaNs):
        '''
        Processes all NaN values in the DataFrame.
        If NaNs is 'drop':
        Drops all observations with NaN values, only when df is the training set.
        If NaNs is 'fill':
        Fills CryoSleep with a spending rule, categorical-like columns with their
        most frequent value and numeric columns with their mean.
        '''
        if NaNs == 'drop':
            if train:
                print(f'DataFrame obsevations before NAs dropping:{len(df)}.')
                df.dropna(inplace=True)
                print(f'DataFrame obsevations after NAs dropping:{len(df)}.')

        elif NaNs == 'fill':

            ###  CryoSleep   ###
            # Passengers with an unknown CryoSleep who spent exactly 0 everywhere
            # are assumed to be cryo sleepers (1); the remaining unknowns are set to 0.
            no_spend = df[money_cols].eq(0).all(axis = 1)
            df.loc[no_spend & df['CryoSleep'].isna(), 'CryoSleep'] = 1
            df['CryoSleep'] = df['CryoSleep'].fillna(0)

            ###  Fill Na's with most frequent values   ###
            most_freq_cols = ['HomePlanet', 'Destination', 'VIP', 'CabinNumber',
                              'CabinDeck', 'CabinSide', 'AgeGroup', 'IsBill']

            for column in most_freq_cols:
                # mode() is sorted, so ties resolve to the smallest value
                modes = df[column].mode(dropna=True)
                if not modes.empty:
                    df[column] = df[column].fillna(modes.iloc[0])

            ###  Fill Na's with mean values   ###
            mean_cols = [age_col, *money_cols,
                         'FirstName_len', 'LastName_len', 'TotalBill', 'CountBill', 'AverageBill']

            for column in mean_cols:
                df[column] = df[column].fillna(df[column].mean())

            if df.isna().sum().sum() > 0:
                print("WARNING: DataFrame still has Na's values")

        else:
            print("NaNs parameter should be one of ('drop', 'fill')")

    #########################################################################################################################
    # Execute the pipeline workflow by order.
    # Engineered columns are created first so that NaNsProcess can clean them too.
    Bool2int(df, bools)
    cabin_engine(df, Cabin_column)
    name_engine(df, Name_Column)
    group_engine(df, ID_Column)
    age_engine(df, age_col)
    payment_engine(df, money_cols)
    NaNsProcess(df, train, NaNs)
    encode(df, categoricals)
    Check_DuplicatedIDs(df, ID_Column)

    lost = num_rows - len(df)
    if lost != 0:
        print(f'Warning : {lost} observations were dropped')

    return  df

In [None]:
# features_engine modifies its input in place, so train_data / test_data
# themselves hold the engineered features after this cell; the returned
# objects are the same frames under a second name.
process_train_data = features_engine(df           = train_data,
                                     train        = True,
                                     Cabin_column = 'Cabin',
                                     ID_Column    = 'PassengerId',
                                     Name_Column  = 'Name',
                                     bools        = ['Transported', 'CryoSleep', 'VIP'],
                                     categoricals = ['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'],
                                     money_cols   = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
                                     age_col      = 'Age',
                                     NaNs         = 'fill')


# The test set has no 'Transported' column, hence the shorter `bools` list.
# Stored under its own name so it no longer overwrites the training result.
process_test_data  = features_engine(df           = test_data,
                                     train        = False,
                                     Cabin_column = 'Cabin',
                                     ID_Column    = 'PassengerId',
                                     Name_Column  = 'Name',
                                     bools        = ['CryoSleep', 'VIP'],
                                     categoricals = ['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'],
                                     money_cols   = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
                                     age_col      = 'Age',
                                     NaNs         = 'fill')

In [None]:
# Peek at four random rows of the training frame after feature engineering.
train_data.sample(n=4)

* 
<h1 style="text-align:Left; font-size:150%; color:#1c6ce6;">
Data Visualisations 📊
</h1>

In [None]:
# Absolute Pearson correlation of every numeric feature with the target.
# Text columns (PassengerId, FirstName, LastName) are excluded explicitly:
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
numeric_data = train_data.select_dtypes(include='number')
corr = numeric_data.corr()['Transported'].abs().drop('Transported').sort_values()
px.bar(corr, title='Absolute correlations of all features to Transported column')

In [None]:
# Random 500-row subset intended for the pair plot below, which is currently disabled.
Sample2Plot = train_data.sample(n=500)
#sns.pairplot(data = Sample2Plot, hue = 'Transported', corner = True);

* 
<h1 style="text-align:Left; font-size:150%; color:#1c6ce6;">
Model and Training 🏋️
</h1>

In [None]:
def TrainClassifier(train_df   : pd.DataFrame,
                    test_df    : pd.DataFrame,
                    scale      : bool,
                    test_size  : float,
                    features   : list,
                    target     : str,
                    classifier : str,
                    clf_params : dict,
                    seed       : int):

    '''
    Perform the full training and testing steps of an XGB or LGBM classifier,
    plot the metrics and return a submission DataFrame ready to submit.

    Neither train_df nor test_df is modified: scaling and predictions are
    applied to copies, so the function can be re-run on the same frames.

    Parameters:
        - train_df   : DataFrame to train,
        - test_df    : DataFrame to predict for the submission (must contain 'PassengerId'),
        - scale      : True if scale is wanted,
        - test_size  : The share (float) or number (int) of train_df rows held out for evaluation,
        - features   : List of desired feature to train the model,
        - target     : The name of the target column in train_df,
        - classifier : The name of the classifier model to use- XGB or LGBM,
        - clf_params : Dictionary containing the hyper-parameter search space,
        - seed       : Random number seed (split, models and parameter search)

    Returns:
        DataFrame with the columns 'PassengerId' and `target` (boolean predictions).

    Raises:
        ValueError if `classifier` is neither 'XGB' nor 'LGBM'.
    '''
    # Fail early instead of reaching the search with an undefined model
    if classifier == 'XGB':
        model = xgb.XGBClassifier(random_state = seed)

    elif classifier == 'LGBM':
        # `silent` was removed from the LightGBM sklearn API; verbose=-1 mutes the logs
        model = lgbm.LGBMClassifier(objective    = 'binary',
                                    random_state = seed,
                                    verbose      = -1)
    else:
        raise ValueError("classifier parameter should be 'XGB' or 'LGBM' ")

                                           #### Training Step ####

    # 1. Split train_df to training-testing (stratified on the target)
    y = train_df[target].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(train_df[features],
                                                        y,
                                                        test_size    = test_size,
                                                        random_state = seed,
                                                        stratify     = y)
    X_submit = test_df[features]

    # 2. Scale data: the scaler is fitted on the training split only, so no
    #    statistics of the hold-out rows leak into training.
    if scale:
        #Scaler = StandardScaler()
        Scaler = RobustScaler()
        #Scaler = MinMaxScaler()
        X_train  = pd.DataFrame(Scaler.fit_transform(X_train), columns = features, index = X_train.index)
        X_test   = pd.DataFrame(Scaler.transform(X_test),      columns = features, index = X_test.index)
        X_submit = pd.DataFrame(Scaler.transform(X_submit),    columns = features, index = X_submit.index)

    # 3. Search the hyper-parameters and keep the refitted best estimator.
    #    best_estimator_ is the base model (objective, seed, verbosity) with the
    #    best parameters, already refitted on the whole training split.
    RSCV = RandomizedSearchCV(estimator = model,
                              param_distributions = clf_params,
                              cv           = 7,
                              n_iter       = 20,
                              scoring      = 'accuracy',
                              random_state = seed)

    RSCV.fit(X_train, y_train)
    best_model = RSCV.best_estimator_

    y_pred = best_model.predict(X_test)
    y_true = list(y_test.values)

    print('F1 score:', f1_score(y_true, y_pred))
    print('\n')
    print('Accuracy score:', accuracy_score(y_true, y_pred))

    #########################################################################################################################


                                      #### Testing Step ####

    # Predict the submission data; built on a copy so test_df stays untouched
    submission = test_df[['PassengerId']].copy()
    submission[target] = best_model.predict(X_submit).astype(bool)

    #########################################################################################################################


                                      #### Review and Metrics Step ####

    # 1.Review classifier parameters (XGB exposes its booster parameters separately)
    if hasattr(best_model, 'get_xgb_params'):
        model_params = best_model.get_xgb_params()
    else:
        model_params = best_model.get_params()
    display(pd.DataFrame([model_params]))

    # 2.Review classifier features by importance
    features_importance = pd.DataFrame(data = best_model.feature_importances_, index = features)
    px.bar(features_importance, title = 'Features Importance for Best Model').show()

    # 3.Plot confusion matrix (each row normalised over the true labels)
    cm = confusion_matrix(y_true    = y_true,
                          y_pred    = y_pred,
                          normalize = 'true')

    cm_Display = ConfusionMatrixDisplay(confusion_matrix = cm,
                                        display_labels   = [False, True])
    cm_Display.plot()
    cm_Display.ax_.set_title("Confusion Matrix");

    # 4. Plot value counts of prediction
    px.bar(submission[target].value_counts(dropna=False),
           title = f'Value Counts of {target} in Submission').show()


    return submission

In [None]:
# Columns fed to the model; commented-out entries are deliberately excluded.
features = [
            #'Age',
            'AgeGroup',
            'AverageBill',
            'CabinDeck',
            'CabinNumber',
            'CabinSide',
            'CountBill',
            'CryoSleep',
            'Destination',
            'FamilySize',
            #'FirstName',
            'FirstName_len',
            'FoodCourt',
            'GroupSize',
            'HomePlanet',
            'IDGroup',
            'IsBill',
            'IsFamily',
            'IsGroup',
            #'LastName',
            'LastName_len',
            'NumberInGroup',
            #'PassengerId',
            'RoomService',
            'ShoppingMall',
            'Spa',
            'TotalBill',
            #'VIP',
            'VRDeck'
           ]



# Random-search space for XGBoost.
# 'logloss' is the binary log-loss; 'mlogloss' is the multi-class metric and
# does not fit the binary 'Transported' target.
xgbClassifier_params = {'booster'         : ['gbtree','gblinear'],
                        'learning_rate'   : [0.2, 0.1, 0.05], 
                        'max_depth'       : [3, 4, 5, 10],
                        'min_child_weight': [10, 15, 20],
                        'n_estimators'    : [50, 300, 600],
                        'eval_metric'     : ['auc', 'logloss']}            
#https://xgboost.readthedocs.io/en/stable/parameter.html


# Random-search space for LightGBM.
lgbmClassifier_params = {'boosting_type' : ['gbdt', 'dart'],
                         'num_leaves'    : [10, 15, 20], 
                         'max_depth'     : [3, 4, 5, 10],
                         'learning_rate' : [0.2, 0.1, 0.05],
                         'n_estimators'  : [50, 300, 600],
                         'metric'        : ['auc','binary_logloss']}             
#https://lightgbm.readthedocs.io/en/v3.3.2/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier



# Train, evaluate and build the submission. To use XGBoost instead, switch
# `classifier` to 'XGB' and `clf_params` to xgbClassifier_params together.
submission = TrainClassifier(train_df  = train_data,
                            test_df    = test_data,
                            scale      = True,
                            test_size  = 0.2,
                            features   = features,
                            target     = 'Transported',
                            classifier = 'LGBM',                 # 'XGB'
                            clf_params = lgbmClassifier_params,  # xgbClassifier_params
                            seed       = 100)

* 
<h1 style="text-align:Left; font-size:150%; color:#1c6ce6;">
Submit Results 🎯
</h1>

In [None]:
# Write the predictions without the index column: the file holds only PassengerId and Transported.
submission.to_csv(path_or_buf='submission.csv', index=False)

<div>
<div style="text-align: center;"> <img src="https://github.com/OfirMazor/Kaggle/blob/main/Spaceship%20Titanic/img/DALLE2%20-%20Spaceship%20Titanic1.png?raw=true" width="400" alt="DALLE2: 'Spaceship Titanic'"/>
</div>
</div>

Spaceship images generated with DALL·E 2.