In [81]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
SEED =42
# models from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
# Models Evaluations
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
from sklearn.metrics import confusion_matrix , classification_report,accuracy_score
from sklearn.metrics import  precision_score,recall_score , f1_score
from sklearn.metrics import RocCurveDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("data/spaceship-titanic data/train.csv")
df_test = pd.read_csv("data/spaceship-titanic data/test.csv")

In [3]:
df.shape , df_test.shape

((8693, 14), (4277, 13))

In [4]:
df_train = df["Transported"]

In [6]:
df.drop("Transported",inplace =True,axis = 1)

In [7]:
df_combined = pd.concat([df , df_test] , axis = 0)

In [8]:
df_combined.shape

(12970, 13)

In [9]:
df_combined["HomePlanet"] = df_combined["HomePlanet"].fillna("Unknown")


In [38]:
df_combined.isna().sum()

HomePlanet        0
CryoSleep         0
Destination     274
Age               0
VIP               4
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Deck            312
Room Number     312
Side            312
dtype: int64

In [21]:
#df_combined . drop("Name",axis = 1 , inplace =True)
#df_combined . drop("PassengerId",axis = 1 , inplace =True)

In [32]:
expense_col = ["RoomService" , "ShoppingMall","Spa","VRDeck","FoodCourt"]
vip_col = ["VIP"]
condition1 = (df_combined["CryoSleep"] == True)
df_combined.loc[~condition1 ,expense_col ] = df_combined.loc[~condition1 ,expense_col ].fillna(-10)
df_combined.loc[~condition1 ,vip_col ] = df_combined.loc[~condition1 ,vip_col ].fillna(True)
condition2 = (df_combined[expense_col].eq(0).all(axis = 1))
df_combined.loc[~condition2 ,"CryoSleep" ] = df_combined.loc[~condition2 ,"CryoSleep" ].fillna(False)

  df_combined.loc[~condition1 ,vip_col ] = df_combined.loc[~condition1 ,vip_col ].fillna(True)
  df_combined.loc[~condition2 ,"CryoSleep" ] = df_combined.loc[~condition2 ,"CryoSleep" ].fillna(False)


In [36]:
cabin_cols = df["Cabin"].str.split('/', expand = True)
cabin_cols.columns = ["Deck" , "Room Number", "Side"]
new_df = pd.concat([df_combined , cabin_cols] , axis = 1)
df_combined = new_df 
df_combined = df_combined.drop("Cabin" ,axis = 1)

In [39]:
df_combined.dtypes

HomePlanet       object
CryoSleep        object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Deck             object
Room Number      object
Side             object
dtype: object

In [43]:
df_combined["Room Number"] 

0         0
1         0
2         0
3         0
4         1
       ... 
4272    298
4273    853
4274    937
4275    143
4276    743
Name: Room Number, Length: 12970, dtype: object

In [20]:
df_combined["Age"] = df_combined["Age"].fillna(df.Age.median())

In [64]:
train_data.dtypes

HomePlanet       object
CryoSleep        object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Deck             object
Room Number     float64
Side             object
dtype: object

In [66]:
numerical_features = ['Room Number', 'Age', 'RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck']
categorical_features = ['HomePlanet','CryoSleep','Destination','VIP','Deck', 'Side']

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('inputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
new_preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features)
])

In [53]:
train_data = df_combined.iloc[:len(df)]
test_data = df_combined.iloc[len(df) : len(df) + len(df_test)]

In [54]:
train_data.shape , test_data.shape


((8693, 13), (4277, 13))

In [55]:
y = df_train

In [59]:
train_data.shape

(8693, 13)

In [63]:
train_data["Room Number"] = train_data["Room Number"].astype(float)
test_data["Room Number"] = test_data["Room Number"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Room Number"] = train_data["Room Number"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["Room Number"] = test_data["Room Number"].astype(float)


In [57]:
y.dtype

dtype('bool')

In [69]:
transformed_data_train = new_preprocessor.fit_transform(train_data)
transformed_data_test = new_preprocessor.transform(test_data)

feature_names = new_preprocessor.named_transformers_['categorical'] \
     .named_steps['encoder'].get_feature_names_out(input_features=categorical_features) 

all_feature_names = numerical_features + list(feature_names)
all_feature_names


['Room Number',
 'Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'HomePlanet_Unknown',
 'CryoSleep_False',
 'CryoSleep_True',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_False',
 'VIP_True',
 'Deck_A',
 'Deck_B',
 'Deck_C',
 'Deck_D',
 'Deck_E',
 'Deck_F',
 'Deck_G',
 'Deck_T',
 'Side_P',
 'Side_S']

In [70]:
transformed_train_df = pd.DataFrame(transformed_data_train ,columns = all_feature_names )
transformed_test_df = pd.DataFrame(transformed_data_test ,columns = all_feature_names )

In [77]:
# First GBM Model
gbm_model_1 = GradientBoostingClassifier(n_estimators=70, learning_rate=0.1, max_features='sqrt', max_depth=5, random_state=SEED, 
                                        min_samples_split=2, min_samples_leaf=3, loss='exponential', subsample=0.5)

# Second GBM Model
gbm_model_2 = GradientBoostingClassifier(n_estimators=70, learning_rate=0.1, max_features='log2', max_depth=5, random_state=SEED, 
                                        min_samples_split=2, min_samples_leaf=3, subsample=0.5, loss='log_loss')

# Third GBM Model
gbm_model_3 = GradientBoostingClassifier(n_estimators=70, learning_rate=0.1, max_features='log2', max_depth=5, random_state=SEED, 
                                        min_samples_split=2, min_samples_leaf=3, subsample=0.5, loss='exponential')

In [79]:
X_train , X_test , y_train ,y_test = train_test_split(transformed_train_df ,y, test_size =0.2 , random_state=SEED)


In [82]:
gbm_model_1.fit(X_train, y_train)
gbm_model_2.fit(X_train, y_train)
gbm_model_3.fit(X_train, y_train)

# Run through predictions initially to train the Logistic Regression for predicting
gbm_1_predictions = gbm_model_1.predict(X_test)
gbm_2_predictions = gbm_model_2.predict(X_test)
gbm_3_predictions = gbm_model_3.predict(X_test)


# Stack both predictions to form a training set for the Logistic Regression
stacked_features = np.column_stack((gbm_1_predictions, gbm_2_predictions, gbm_3_predictions))

# Fit the meta model to stacked features
meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_test)

# Predict yet again to get a new metric
gbm_1_base_preds = gbm_model_1.predict(X_test)
gbm_2_base_preds = gbm_model_2.predict(X_test)
gbm_3_base_preds = gbm_model_3.predict(X_test)


# Stack both predictions yet again
stacked_base_preds = np.column_stack((gbm_1_base_preds, gbm_2_base_preds, gbm_3_base_preds))

# Get final predictions to gauge overall performance
ensemble_predictions = meta_model.predict(stacked_base_preds)

# Score the predictions and print them
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 0.7918343875790684


In [83]:
# Get new base predictions for actual test set
gbm_1_base_preds = gbm_model_1.predict(transformed_test_df)
gbm_2_base_preds = gbm_model_2.predict(transformed_test_df)
gbm_3_base_preds = gbm_model_3.predict(transformed_test_df)

# Combine both predictions for the meta model to predict
stacked_base_preds = np.column_stack((gbm_1_base_preds, gbm_2_base_preds, gbm_3_base_preds))

In [84]:
gbm_1_base_preds = gbm_model_1.predict(transformed_test_df)
gbm_2_base_preds = gbm_model_2.predict(transformed_test_df)
gbm_3_base_preds = gbm_model_3.predict(transformed_test_df)

# Combine both predictions for the meta model to predict
stacked_base_preds = np.column_stack((gbm_1_base_preds, gbm_2_base_preds, gbm_3_base_preds))

In [85]:
ensemble_predictions = meta_model.predict(stacked_base_preds)

In [86]:
output = pd.DataFrame({"PassengerId" : df_test["PassengerId"] , 
                       "Transported" : ensemble_predictions})

In [87]:
output.to_csv("submission.csv" , index = False)