In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
import joblib

# > **#DATA PREPROCESSING**

In [2]:
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [3]:
train = train.drop(["Name"],axis=1)
train[['Class', 'Row', 'Seat']] = train['Cabin'].str.split('/', expand=True)
test[['Class', 'Row', 'Seat']] = test['Cabin'].str.split('/', expand=True)

In [4]:
cols_to_encode = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP','Class', 'Row', 'Seat']

# inisialisasi objek LabelEncoder
le = LabelEncoder()
for col in cols_to_encode:
    train[col] = le.fit_transform(train[col].astype(str))
    test[col] = le.fit_transform(test[col].astype(str))

In [5]:
train.dropna(axis=0, inplace=True)

In [6]:
medianRS = test['RoomService'].median()
medianFC = test['FoodCourt'].median()
medianSM = test['ShoppingMall'].median()
medianSP = test['Spa'].median()
medianVD = test['VRDeck'].median()
medianAE = test['Age'].median()
test['RoomService'].fillna(medianRS, inplace=True)
test['FoodCourt'].fillna(medianFC, inplace=True)
test['ShoppingMall'].fillna(medianSM, inplace=True)
test['Spa'].fillna(medianSP, inplace=True)
test['VRDeck'].fillna(medianVD, inplace=True)
test['Age'].fillna(medianAE, inplace=True)
test['HomePlanet'].fillna('0', inplace=True)
test['CryoSleep'].fillna('0', inplace=True)
test['Cabin'].fillna('0', inplace=True)
test['Destination'].fillna('0', inplace=True)
test['VIP'].fillna('0', inplace=True)
test.dropna(axis=1, inplace=True)

# **Modelling**

In [7]:
X = train.drop(["Transported","PassengerId"],axis=1)
y = train["Transported"]

In [8]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
import optuna
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def objective(trial):
    # Define the hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    # Train the random forest classifier with the chosen hyperparameters
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, random_state=42)
    clf.fit(X_train, y_train)
    # Calculate the F1 score on the test set
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    return f1

if __name__ == "__main__":
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)
    print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))


[32m[I 2023-04-02 14:25:11,171][0m A new study created in memory with name: no-name-6da24813-444a-4c1a-82ad-3f38c3d97a0f[0m
[32m[I 2023-04-02 14:25:12,230][0m Trial 0 finished with value: 0.7962340282447882 and parameters: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7962340282447882.[0m
[32m[I 2023-04-02 14:25:13,614][0m Trial 1 finished with value: 0.7729468599033816 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 0 with value: 0.7962340282447882.[0m
[32m[I 2023-04-02 14:25:19,764][0m Trial 2 finished with value: 0.8308489954633831 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.8308489954633831.[0m
[32m[I 2023-04-02 14:25:22,276][0m Trial 3 finished with value: 0.830213040671400

Best trial: score 0.8419705694177863, params {'n_estimators': 1000, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'log2'}


In [10]:
##param = Best trial: score 0.8408215661103979,
params = {'n_estimators': 1000, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 'sqrt'}

In [11]:
rf = RandomForestClassifier(**params)
model = rf.fit(X_train,y_train)
x_pred =  model.predict(X_test)
print(classification_report(x_pred,y_test))
joblib.dump(model, 'model.joblib')

              precision    recall  f1-score   support

       False       0.82      0.85      0.83       730
        True       0.85      0.82      0.84       794

    accuracy                           0.83      1524
   macro avg       0.83      0.84      0.83      1524
weighted avg       0.84      0.83      0.83      1524



['model.joblib']

In [12]:
loaded_model = joblib.load('model.joblib')
y_pred = loaded_model.predict(test.drop(["PassengerId"],axis=1))
results = pd.DataFrame({"PassengerId" : test["PassengerId"],
                        "Transported" : y_pred})
results

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [13]:
results['Transported'] = results['Transported'].replace({1: True, 0: False})
results.to_csv("Prediction_result.csv", index=False)

# **Sumarry**
# after several trials it can be concluded that there are differences in data distribution between the test data and the train data (not similar). good modeling results will not affect the level of success in predicting test data.