In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import VotingClassifier
path_to_train = "/kaggle/input/spaceship-titanic/train.csv"
path_to_validat = "/kaggle/input/spaceship-titanic/test.csv"

In [2]:
df = pd.read_csv(path_to_train)
valid_df = pd.read_csv(path_to_validat)

np.random.seed(42)
random_state = 42

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Preprocessing

To improve readability, a single master method will be divided into one method that calls seven others. Processing with methods helps avoid duplicating the entire code for both the train and validation DataFrames.

## Group transformation

We spliting `PassengerId` to get Group of each passanger

In [5]:
def group_transform(X):
  X['Group'] = X['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)
  X=X.set_index('PassengerId')
  X['Group'] = X['Group'].astype(int)
  return X

##Home and Destinaton transformation

First fill `HomePlanet` and `Destination` with the first not NaN value of each `Group`. Second fill we remove NaN in exception ones with mode

In [6]:
def home_n_destin_transform(X):
  X['HomePlanet'] = X['HomePlanet'].fillna(X.groupby('Group')['HomePlanet'].transform('first'))
  X['Destination'] = X['Destination'].fillna(X.groupby('Group')['Destination'].transform('first'))
  X['HomePlanet'] = X['HomePlanet'].fillna(X['HomePlanet'].mode()[0])
  X['Destination'] = X['Destination'].fillna(X['Destination'].mode()[0])
  # One-hot encode the `HomePlanet` column
  X = pd.get_dummies(X, columns=['HomePlanet'], drop_first=False)
  X = pd.get_dummies(X, columns=['Destination'], drop_first=False)
  return X

## Deck and Side transformaion

We split `Cabin` in 3 new columns. `Num` is not informative one.

Filling `Deck` and `Side` with random.choise based on percentage frequency.
Reordering `Deck` categories that can help some machine learning models understand the data better

In [7]:
def deck_n_side_transform(X):
  X[['Deck', 'Num', 'Side']] = X['Cabin'].str.split('/', expand=True)
  X = X.drop('Num', axis=1)
  for column in ['Deck', 'Side']:
    train_value_counts = X[column].value_counts(normalize=True)
    X[column] = X[column].fillna(np.random.choice(train_value_counts.index, p=train_value_counts.values))

  X['Deck'] = X['Deck'].astype('category')
  X['Deck'] = X['Deck'].cat.reorder_categories(X['Deck'].value_counts().index.to_list(), ordered=True)
  X['Deck'] = X['Deck'].cat.codes
  # One-hot encode
  X = pd.get_dummies(X, columns=['Side'], drop_first=False)
  return X

## CryoSleep transformaion

Saving the proportions of `CryoSleep` in each one `Deck`, we fill NaN in `CryoSleep`

In [8]:
def cryosleep_transform(X):
  for deck in X['Deck'].unique():
      subset = X[X['Deck'] == deck]
      if subset['CryoSleep'].isnull().sum() > 0:
          value_counts = subset['CryoSleep'].value_counts(normalize=True)
          X.loc[X['Deck'] == deck, 'CryoSleep'] = X.loc[X['Deck'] == deck, 'CryoSleep'].fillna(np.random.choice(value_counts.index, p=value_counts.values))
  # One-hot encode
  X = pd.get_dummies(X, columns=['CryoSleep'], drop_first=False)

  return X

## VIP transformaion

Saving the proportions of `VIP` in each one `Deck`, we fill NaN in `VIP`

In [9]:
def vip_transform(X):
  for deck in X['Deck'].unique():
      subset = X[X['Deck'] == deck]
      if subset['VIP'].isnull().sum() > 0:
          value_counts = subset['VIP'].value_counts(normalize=True)
          X.loc[X['Deck'] == deck, 'VIP'] = X.loc[X['Deck'] == deck, 'VIP'].fillna(np.random.choice(value_counts.index, p=value_counts.values))

  # One-hot encode the VIP column
  X = pd.get_dummies(X, columns=['VIP'], drop_first=False)
  return X

## Age transformaion

Just fill `Age` with median

In [10]:
def age_transform(X):
  X['Age'] = X['Age'].fillna(X['Age'].median())
  return X

## TotalMoney transformaion

Fill `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` with median. If passanger is in `CryoSleep` he cant spend money. Sum all spended money in new column `TotalMoney`

In [11]:
def totalmoney_transform(X):
  for col in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    X[col] = X[col].fillna(X[col].median())
    X.loc[X['CryoSleep_True'] == 1, col] = 0

  X['TotalMoney'] = X['RoomService'] + X['FoodCourt'] + X['ShoppingMall'] + X['Spa'] + X['VRDeck']
  return X

## Master method

Method that use all previous, and some final preparation of data

In [12]:
def new_columns_n_fill(X):
  X=group_transform(X)
  X=home_n_destin_transform(X)
  X=deck_n_side_transform(X)
  X=cryosleep_transform(X)
  X=vip_transform(X)
  X=age_transform(X)
  X=totalmoney_transform(X)
  # drop not needed columns
  X = X.drop(['Cabin', 'Name'], axis=1)
  # change type of columns that in bool to int
  for col in X.columns:
    if X[col].dtype == 'bool':
        X[col] = X[col].astype(int)

  return X

Call of the master method for each df

In [13]:
df = new_columns_n_fill(df)
valid_df = new_columns_n_fill(valid_df)

In [14]:
df.shape

(8693, 22)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   Transported                8693 non-null   int64  
 7   Group                      8693 non-null   int64  
 8   HomePlanet_Earth           8693 non-null   int64  
 9   HomePlanet_Europa          8693 non-null   int64  
 10  HomePlanet_Mars            8693 non-null   int64  
 11  Destination_55 Cancri e    8693 non-null   int64  
 12  Destination_PSO J318.5-22  8693 non-null   int64  
 13  Destination_TRAPPIST-1e    8693 non-null   i

# Premodel prepare

Spliting out DataFrame to further use in models, scaling all data (validation also)

In [16]:
X, y = df.drop('Transported', axis=1), df['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
val_scaled = scaler.transform(valid_df)

# Models

## MLPClassifier

In [17]:
mlp_params = {
    'activation': 'relu',
    'solver': 'adam',
    'early_stopping': True,
}

mlp = MLPClassifier(**mlp_params)

mlp_params_tuned = {
    'hidden_layer_sizes': [(10,),(20,),(50,)],
    'alpha': [0.01, 0.1],
    'learning_rate_init': [0.0001, 0.0005, 0.001],
    'max_iter': [200, 500],
}

mlp_randomized = RandomizedSearchCV(mlp, mlp_params_tuned, n_iter=10, cv=5, verbose=1, random_state=random_state, n_jobs=-1)
mlp_randomized.fit(X_train_scaled, y_train)

print(f"Best parameters: {mlp_randomized.best_params_}")
print(f"Best score: {mlp_randomized.best_score_}")

# Validate the model using the best parameters
best_mlp = mlp_randomized.best_estimator_
y_pred = best_mlp.predict(X_test_scaled)
print(f"Accuracy for best MLP model: {accuracy_score(y_test, y_pred)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters: {'max_iter': 500, 'learning_rate_init': 0.001, 'hidden_layer_sizes': (50,), 'alpha': 0.1}
Best score: 0.8010994174027438
Accuracy for best MLP model: 0.7701149425287356


## XGBoost

In [18]:
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 7,
    'colsample_bytree': 1.0,
    'n_jobs': -1,
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'verbosity': 0,
    'random_state': random_state,
}

xgb_model = XGBClassifier(**xgb_params)

xgb_params_tuned = {
    'n_estimators': np.arange(250, 350, 50),
    'max_depth': np.arange(12, 14, 1),
    'learning_rate': np.arange(0.01, 0.03,0.004),
    'subsample': np.arange(0.2, 0.4, 0.1),
    'colsample_bytree': np.arange(0.2, 0.4, 0.1),
}

xgb_randomized = RandomizedSearchCV(xgb_model, xgb_params_tuned, cv=10, n_jobs=-1, verbose=1)

xgb_randomized.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", xgb_randomized.best_params_)
print("Best mean cross-validation score: {:.3f}".format(xgb_randomized.best_score_))

# Validate the model using the best parameters
best_xgb = xgb_randomized.best_estimator_
y_pred = best_xgb.predict(X_test_scaled)
print(f"Accuracy for best XGBoost model: {accuracy_score(y_test, y_pred)}")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best hyperparameters:  {'subsample': 0.30000000000000004, 'n_estimators': 300, 'max_depth': 12, 'learning_rate': 0.026000000000000002, 'colsample_bytree': 0.30000000000000004}
Best mean cross-validation score: 0.806
Accuracy for best XGBoost model: 0.7839080459770115


## CatBoost

In [19]:
cat_params = {
    'depth':  np.arange(11, 14,1),
    'learning_rate': np.arange(0.001, 0.02,0.002),
    'l2_leaf_reg': [0.1],
    'random_strength': [0.2],
    'max_bin': [200],
    'bootstrap_type': ['Bayesian', 'Bernoulli'],
    'one_hot_max_size': [50],
    'iterations': np.arange(200, 500, 100),
}

cat_model = CatBoostClassifier(eval_metric='Logloss', verbose=0, random_state=random_state, grow_policy='Lossguide')

cat_randomized = RandomizedSearchCV(cat_model, cat_params, cv=5, n_jobs=-1, verbose=1,random_state=random_state)

cat_randomized.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", cat_randomized.best_params_)
print("Best mean cross-validation score: {:.3f}".format(cat_randomized.best_score_))

# Validate the model using the best parameters
best_cat = cat_randomized.best_estimator_
y_pred = best_cat.predict(X_test_scaled)
print(f"Accuracy for best CatBoost model: {accuracy_score(y_test, y_pred)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters:  {'random_strength': 0.2, 'one_hot_max_size': 50, 'max_bin': 200, 'learning_rate': 0.017, 'l2_leaf_reg': 0.1, 'iterations': 400, 'depth': 11, 'bootstrap_type': 'Bernoulli'}
Best mean cross-validation score: 0.806
Accuracy for best CatBoost model: 0.7770114942528735


## GradientBoost

In [20]:
gb_params = {
    'n_estimators': np.arange(500, 800, 100),
    'learning_rate': np.logspace(-4, 0, num=100),
    'max_depth': np.arange(7, 11),
    'min_samples_split': np.arange(7, 12),
    'min_samples_leaf': [1],
    'max_features': ['sqrt', 'log2']
}

gb_model = GradientBoostingClassifier(random_state=random_state)

gb_randomized = RandomizedSearchCV(gb_model, gb_params, cv=5, n_jobs=-1, verbose=1)
gb_randomized.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", gb_randomized.best_params_)
print("Best mean cross-validation score: {:.3f}".format(gb_randomized.best_score_))

# Validate the model using the best parameters
best_gb = gb_randomized.best_estimator_
y_pred = best_gb.predict(X_test_scaled)
print(f"Accuracy for best GradientBoosting model: {accuracy_score(y_test, y_pred)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters:  {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 9, 'learning_rate': 0.02915053062825179}
Best mean cross-validation score: 0.802
Accuracy for best GradientBoosting model: 0.7896551724137931


## DecisionTree

In [21]:
dt_params = {
    'max_depth': np.arange(15, 20),
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(5, 10),
    'max_features': ['sqrt', 'log2', None]
}

dt_model = DecisionTreeClassifier(random_state=random_state)

dt_randomized = RandomizedSearchCV(dt_model, dt_params, cv=5, n_jobs=-1, verbose=1)
dt_randomized.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", dt_randomized.best_params_)
print("Best mean cross-validation score: {:.3f}".format(dt_randomized.best_score_))

# Validate the model using the best parameters
best_dt = dt_randomized.best_estimator_
y_pred = best_dt.predict(X_test_scaled)
print(f"Accuracy for best DecisionTree model: {accuracy_score(y_test, y_pred)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters:  {'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 18}
Best mean cross-validation score: 0.773
Accuracy for best DecisionTree model: 0.764367816091954


## RandomForest

In [22]:
rf_params = {
    'n_estimators': np.arange(500, 800, 100),
    'max_depth': np.arange(15, 20),
    'min_samples_split': np.arange(7, 10),
    'min_samples_leaf': np.arange(5, 8),
    'max_features': ['sqrt', 'log2']
}

rf_model = RandomForestClassifier(random_state=random_state)

rf_randomized = RandomizedSearchCV(rf_model, rf_params, cv=5, n_jobs=-1, verbose=1)
rf_randomized.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", rf_randomized.best_params_)
print("Best mean cross-validation score: {:.3f}".format(rf_randomized.best_score_))

# Validate the model using the best parameters
best_rf = rf_randomized.best_estimator_
y_pred = best_rf.predict(X_test_scaled)
print(f"Accuracy for best RandomForest model: {accuracy_score(y_test, y_pred)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters:  {'n_estimators': 600, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 17}
Best mean cross-validation score: 0.803
Accuracy for best RandomForest model: 0.7758620689655172


# Voting

In [23]:
voting_clf = VotingClassifier(estimators=[
    ('mlp', best_mlp),
    ('xgb', best_xgb),
    ('cat', best_cat),
    ('gb', best_gb),
    ('dt', best_dt),
    ('rf', best_rf)
], voting='soft')

voting_clf.fit(X_train_scaled, y_train)
y_pred = voting_clf.predict(X_test_scaled)
print(f"Accuracy for best ensemble model with hard voting: {accuracy_score(y_test, y_pred)}")

Accuracy for best ensemble model with hard voting: 0.7850574712643679


# Output

In [24]:
valid_df['Transported'] = voting_clf.predict(val_scaled)

In [25]:
valid_df = valid_df.reset_index()

In [26]:
output_valid_df=valid_df[['PassengerId','Transported']]

In [27]:
output_valid_df['Transported'] = output_valid_df['Transported'].replace({0: 'False', 1: 'True'})

In [28]:
output_valid_df.to_csv('Prod_1.csv', index=False)