In [38]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [39]:
%%time
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, zipfile, xgboost as xgb, optuna
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

CPU times: user 34 µs, sys: 1 µs, total: 35 µs
Wall time: 38.9 µs


In [40]:
data_train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
data_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
data_train.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [41]:
data_test.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers


In [42]:
print(f"{data_train.shape}\n{data_test.shape}")

(8693, 14)
(4277, 13)


In [43]:
strange_data = ['HomePlanet', 'Destination']
for data in strange_data:
    print(data_train[data].value_counts())

HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: count, dtype: int64
Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64


In [44]:
data_train.drop(['PassengerId', 'Name', 'Cabin'], axis=1, inplace=True)
data_test.drop(['Name', 'Cabin'], axis=1, inplace=True)

In [45]:
data_train = pd.get_dummies(data_train, columns=['HomePlanet', 'Destination'])
data_test = pd.get_dummies(data_test, columns=['HomePlanet', 'Destination'])
data_train.head(2)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,False,True
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,True,False,False,False,False,True


In [46]:
for col in data_train.columns.to_list():
    print(f"{col} : {data_train[col].isna().sum()}")

CryoSleep : 217
Age : 179
VIP : 203
RoomService : 181
FoodCourt : 183
ShoppingMall : 208
Spa : 183
VRDeck : 188
Transported : 0
HomePlanet_Earth : 0
HomePlanet_Europa : 0
HomePlanet_Mars : 0
Destination_55 Cancri e : 0
Destination_PSO J318.5-22 : 0
Destination_TRAPPIST-1e : 0


In [47]:
isna_columns_float = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in isna_columns_float:
    data_train[col] = data_train[col].fillna(data_train[col].mean())

for col in isna_columns_float:
    data_test[col] = data_test[col].fillna(data_test[col].mean())

In [48]:
data_train = data_train.dropna(subset=['VIP', 'CryoSleep'])

prob_true_vip = data_test['VIP'].mean()  
prob_true_cs = data_test['CryoSleep'].mean()  

data_test['VIP'] = data_test['VIP'].fillna(pd.Series(np.random.rand(len(data_test)) < prob_true_vip)).infer_objects(copy=False).astype(bool)
data_test['CryoSleep'] = data_test['CryoSleep'].fillna(pd.Series(np.random.rand(len(data_test)) < prob_true_cs)).infer_objects(copy=False).astype(bool)



  data_test['VIP'] = data_test['VIP'].fillna(pd.Series(np.random.rand(len(data_test)) < prob_true_vip)).infer_objects(copy=False).astype(bool)
  data_test['CryoSleep'] = data_test['CryoSleep'].fillna(pd.Series(np.random.rand(len(data_test)) < prob_true_cs)).infer_objects(copy=False).astype(bool)


In [49]:
for col in data_train.columns.to_list():
    if 'HomePlanet_' in col or 'Destination_' in col:
        data_train[col] = data_train[col].replace({True : 1, False : 0}).astype(int)

for col in data_test.columns.to_list():
    if 'HomePlanet_' in col or 'Destination_' in col:
        data_test[col] = data_test[col].replace({True : 1, False : 0}).astype(int)

data_train.CryoSleep = data_train.CryoSleep.replace({False : 0, True : 1}).infer_objects(copy=False).astype(int)
data_train.VIP = data_train.VIP.replace({False : 0, True : 1}).infer_objects(copy=False).astype(int)
data_train.Transported = data_train.Transported.replace({False : 0, True : 1}).astype(int)

data_test.CryoSleep = data_test.CryoSleep.replace({False : 0, True : 1}).astype(int)
data_test.VIP = data_test.VIP.replace({False : 0, True : 1}).astype(int)

data_train.head(2)

  data_train[col] = data_train[col].replace({True : 1, False : 0}).astype(int)
  data_train[col] = data_train[col].replace({True : 1, False : 0}).astype(int)
  data_train[col] = data_train[col].replace({True : 1, False : 0}).astype(int)
  data_train[col] = data_train[col].replace({True : 1, False : 0}).astype(int)
  data_train[col] = data_train[col].replace({True : 1, False : 0}).astype(int)
  data_train[col] = data_train[col].replace({True : 1, False : 0}).astype(int)
  data_test[col] = data_test[col].replace({True : 1, False : 0}).astype(int)
  data_test[col] = data_test[col].replace({True : 1, False : 0}).astype(int)
  data_test[col] = data_test[col].replace({True : 1, False : 0}).astype(int)
  data_test[col] = data_test[col].replace({True : 1, False : 0}).astype(int)
  data_test[col] = data_test[col].replace({True : 1, False : 0}).astype(int)
  data_test[col] = data_test[col].replace({True : 1, False : 0}).astype(int)
  data_train.CryoSleep = data_train.CryoSleep.replace({False : 0

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,1
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1,0,0,0,0,1


In [50]:
for col in ['Age', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']:
    data_train[[col]] = MinMaxScaler().fit_transform(data_train[[col]])
    data_test[[col]] = MinMaxScaler().fit_transform(data_test[[col]])

In [51]:
X_train, X_valid, y_train, y_valid = train_test_split(
    data_train.iloc[:, data_train.columns != data_train.columns[8]],
    data_train.iloc[:,[8]],
    test_size=0.25
)

In [52]:
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
def objective(trial):
  params = {
      'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
      'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
      'max_depth': trial.suggest_int('max_depth', 3, 10),
      'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
      'subsample': trial.suggest_float('subsample', 0.5, 1.0),
      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
      'gamma': trial.suggest_float('gamma', 0, 5),
      'alpha': trial.suggest_float('alpha', 0, 10),
      'lambda': trial.suggest_float('lambda', 0, 10),
  }
  boosting = xgb.XGBClassifier(
      **params,
      random_state=42,
      use_label_encoder=False,
      eval_metric='logloss'
  )
  boosting.fit(X_train, y_train)
  predictions = boosting.predict(X_valid)
  return mean_squared_error(predictions, y_valid)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

In [53]:
best_params = study.best_params
boosting = xgb.XGBClassifier(**best_params, random_state=42, use_label_encoder=False)
boosting.fit(X_train, y_train)
None

In [54]:
print(
    f'MSE: {mean_squared_error(boosting.predict(X_valid), y_valid)}'
)

MSE: 0.23816425120772947


In [55]:
X_test = data_test.iloc[:,1:]

In [56]:
preds_test = boosting.predict(X_test)
output = pd.DataFrame({'PassengerId': data_test['PassengerId'],
                       'Transported': preds_test})
output.to_csv('submission.csv', index=False)