# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and it motivates me to offer more content to the Kaggle community! 😊

👉 EDA is done in this [notebook](https://www.kaggle.com/hasanbasriakcay/spaceship-titanic-eda-fe-baseline).

👉 Model comparison is done in this [notebook](https://www.kaggle.com/hasanbasriakcay/spaceship-titanic-pycaret-model-comparisons/notebook).

In [None]:
import pandas as pd
import numpy as np
import warnings

# Keep the notebook output free of library warnings.
warnings.simplefilter("ignore")

# Competition inputs: training data, test data and the sample submission.
df_train = pd.read_csv("../input/spaceship-titanic/train.csv")
df_test = pd.read_csv("../input/spaceship-titanic/test.csv")
submission = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")

# Preview the first rows of each frame, in loading order.
for _frame in (df_train, df_test, submission):
    display(_frame.head())

# Feature Engineering

In [None]:
def _add_spend_shares(frame, spend_cols, total_col='TotalSpend'):
    """Add a ``Pct<col>`` column (share of `total_col`) for every spend column.

    `frame` is modified in place. Rows whose total is zero yield 0/0 = NaN,
    which is replaced by 0.
    """
    for col in spend_cols:
        frame['Pct' + col] = (frame[col] / frame[total_col]).fillna(0)


def create_features(df):
    """Build passenger-level and group-level features.

    The input frame is not modified; all work happens on a copy.

    Passenger-level steps:
      * ``Cabin`` is split on "/" into ``Deck``, ``Num`` and ``Side``
        (missing cabins become "None/None/None").
      * ``PassengerId`` is split on "_" into ``PassengerGroup`` and
        ``PassengerNo``.
      * Missing spend amounts are set to 0, ``TotalSpend`` is their sum and
        ``Pct<col>`` is each amount's share of that total.
      * Missing ``VIP`` / ``CryoSleep`` become False, missing ``HomePlanet`` /
        ``Destination`` become the string 'None', and missing ``Age`` is
        filled with the median age of the passenger's ``HomePlanet``.

    Group-level steps: the rows are aggregated per ``PassengerGroup`` and the
    aggregates are merged back onto every passenger. Aggregate columns whose
    name collides with a passenger-level column get the ``_Group`` suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PassengerId``, ``Cabin``, ``HomePlanet``,
        ``Destination``, ``Age``, ``VIP``, ``CryoSleep`` and the five spend
        columns.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The enriched frame and the column names of the per-group aggregate
        table (before suffixing, including ``PassengerGroup`` and ``Count``).
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    df["Cabin"] = df["Cabin"].fillna("None/None/None")
    df[["Deck", "Num", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df[["PassengerGroup", "PassengerNo"]] = df["PassengerId"].str.split("_", expand=True)

    df[spend_cols] = df[spend_cols].fillna(0)
    df['TotalSpend'] = df[spend_cols].sum(axis=1)
    _add_spend_shares(df, spend_cols)

    df['VIP'] = df['VIP'].fillna(False)
    df['CryoSleep'] = df['CryoSleep'].fillna(False)
    df['HomePlanet'] = df['HomePlanet'].fillna('None')
    df['Destination'] = df['Destination'].fillna('None')
    # Median age per home planet; 'None' (unknown planet) is its own group.
    planet_median_age = df.groupby('HomePlanet')['Age'].transform('median')
    df['Age'] = df['Age'].fillna(planet_median_age)

    # Per-group aggregates. The dict order fixes the column order of
    # `df_group`, and therefore of the returned column list.
    df_group = (
        df.groupby('PassengerGroup', as_index=False)
        .agg({
            'PassengerNo': 'nunique',
            'VIP': lambda s: (s == True).sum(),  # noqa: E712 - element-wise compare
            'CryoSleep': lambda s: (s == True).sum(),  # noqa: E712
            'Cabin': 'nunique',
            'Deck': 'nunique',
            'Side': 'nunique',
            'HomePlanet': 'nunique',
            'Age': 'mean',
            'RoomService': 'mean',
            'FoodCourt': 'mean',
            'ShoppingMall': 'mean',
            'Spa': 'mean',
            'VRDeck': 'mean',
            'TotalSpend': 'mean',
        })
        .rename(columns={'PassengerNo': 'Count'})
    )
    _add_spend_shares(df_group, spend_cols)

    # Every passenger's group exists in `df_group`, so the default inner join
    # keeps all rows, in the order of the left frame.
    df = df.merge(df_group, on="PassengerGroup", suffixes=('', '_Group'))

    return df, list(df_group.columns)

In [None]:
# Engineer features for both splits; the list of group-level column names is
# only kept from the training call.
train, group_cols = create_features(df_train)
test = create_features(df_test)[0]

# Data Cleaning

In [None]:
# Columns removed before modelling.
drop_cols = ['PassengerNo', 'Name', 'PassengerGroup', 'Cabin']
# Presumably further candidates for removal (not dropped): CryoSleep, Deck
# `columns=` is used because passing the axis positionally (`drop(cols, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
test.drop(columns=drop_cols, inplace=True)
train.drop(columns=drop_cols, inplace=True)

In [None]:
# Complete-case copies used for pseudo-labeling. The index is renumbered to
# 0..n-1 and the old index is discarded (`drop=True`); without it the old row
# numbers would be kept as an extra `index` column and end up among the model
# features, since the feature list is taken from `test_dropna.columns`.
train_dropna = train.dropna().reset_index(drop=True)
test_dropna = test.dropna().reset_index(drop=True)

# Pseudo Labeling

In [None]:
def pseudo_labeling(df_train, df_test, target, features, object_cols, th=0.999, fold=10):
    """Pseudo-label the confidently predicted rows of `df_test`.

    A CatBoost classifier is trained on each stratified fold of `df_train`.
    The test-set probabilities are averaged over the folds, and only the rows
    whose averaged probability is <= 1 - th or >= th are kept and labeled.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Labeled data containing `features` and `target`.
    df_test : pandas.DataFrame
        Unlabeled data containing `features`.
    target : str
        Name of the binary target column.
    features : sequence of str
        Columns used as model inputs.
    object_cols : list of str
        Columns passed to CatBoost as `cat_features`.
    th : float, default 0.999
        Confidence threshold on the averaged positive-class probability.
    fold : int, default 10
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        The confident subset of `df_test` (original index labels kept) with
        two extra columns: ``<target>_proba`` and the boolean ``<target>``.

    Notes
    -----
    Prints the out-of-fold ROC AUC and the number of pseudo-labeled rows.
    Each validation fold is also passed to CatBoost as `eval_set`.
    """
    from catboost import CatBoostClassifier
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import roc_auc_score

    X_train = df_train[features]
    X_test = df_test[features]
    y_train = df_train[target]

    oof = np.zeros(len(X_train))
    preds = np.zeros(len(X_test))

    skf = StratifiedKFold(n_splits=fold, random_state=42, shuffle=True)
    # `split` yields positional indices, so rows are selected with `.iloc`;
    # this keeps the function correct for frames whose index is not 0..n-1.
    for fit_pos, val_pos in skf.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[fit_pos], y_train.iloc[fit_pos]
        X_val, y_val = X_train.iloc[val_pos], y_train.iloc[val_pos]

        clf = CatBoostClassifier(cat_features=object_cols, verbose=0)
        clf.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])

        oof[val_pos] = clf.predict_proba(X_val)[:, 1]
        preds += clf.predict_proba(X_test)[:, 1] / skf.n_splits

    confident = (preds <= (1 - th)) | (preds >= th)
    pseudo_labeled_test = df_test.copy()
    pseudo_labeled_test[target + "_proba"] = preds
    # Explicit copy so the label assignment below does not write into a slice.
    pseudo_labeled_test = pseudo_labeled_test.loc[confident, :].copy()
    pseudo_labeled_test[target] = pseudo_labeled_test[target + "_proba"] >= th

    auc = roc_auc_score(df_train[target], oof)
    print('CatBoost CV AUC =', round(auc, 5), "- pseudo_label len =", len(pseudo_labeled_test))

    return pseudo_labeled_test

In [None]:
numeric_cols = test_dropna.select_dtypes(include=np.number).columns.tolist()
# Non-numeric columns are treated as categorical. They are listed in frame
# order rather than through a set difference, whose iteration order for
# strings can change from one interpreter run to the next.
object_cols = [col for col in test_dropna.columns if col not in numeric_cols]

# Every column of the test frame is used as a model input.
features = test_dropna.columns
pseudo_labeled_test = pseudo_labeling(train_dropna, test_dropna, "Transported", features, object_cols, th=0.99)
pseudo_labeled_test.head()

In [None]:
# Number of pseudo-labeled rows per assigned "Transported" value.
pseudo_labeled_test["Transported"].value_counts()

In [None]:
# Append the pseudo-labeled test rows to the training data. `join="inner"`
# keeps only the columns present in both frames; `ignore_index=True`
# renumbers the rows so the result has no duplicate index labels.
new_train = pd.concat([train, pseudo_labeled_test], join="inner", ignore_index=True)
new_train.head()

# Modelling

In [None]:
%%capture
!pip install pycaret[full]

In [None]:
from pycaret.classification import *

numeric_cols = new_train.select_dtypes(include=np.number).columns.tolist()
# Non-numeric columns except the target, in frame order (a set difference
# would give an order that can vary between runs).
object_cols = [
    col for col in new_train.columns
    if col not in numeric_cols and col != "Transported"
]
ignore_cols = ['PassengerId']
# Names of the group-level aggregates as they appear after the merge: the join
# key is dropped and everything except 'Count' carries the '_Group' suffix.
# A fresh list is built and already-suffixed names are left alone, so
# re-running this cell is harmless.
group_cols = [
    col if col == 'Count' or col.endswith('_Group') else col + '_Group'
    for col in group_cols
    if col != 'PassengerGroup'
]

clf = setup(data=new_train,
            target='Transported',
            train_size = 0.99,
            normalize = True,
            normalize_method = 'robust',
            create_clusters = True,
            #feature_interaction = True,
            numeric_features = numeric_cols,
            categorical_features = object_cols,
            ignore_features = ignore_cols,
            #group_features = group_cols,
            ignore_low_variance=True,
            remove_multicollinearity = True,
            session_id = 42,
            use_gpu = False,
            silent = True,
            fold_strategy = 'stratifiedkfold',
            fold = 10,
            n_jobs = -1)

In [None]:
# Train the two candidate models, CatBoost first and LightGBM second.
model_catboost, model_lightgbm = (
    create_model(estimator) for estimator in ('catboost', 'lightgbm')
)

top = [model_catboost, model_lightgbm]

In [None]:
#tuned_top = [tune_model(i, optimize='Accuracy', choose_better=True, n_iter=1000) for i in top]

In [None]:
# Hyper-parameter search for the CatBoost model, optimizing accuracy over
# 1000 iterations.
tuned_catboost = tune_model(
    model_catboost,
    optimize='Accuracy',
    choose_better=True,
    n_iter=1000,
)

# Blending

In [None]:
#stack = stack_models(top, optimize='Accuracy')
#blend = blend_models(tuned_top, optimize='Accuracy')
#predict_model(blend);

In [None]:
# Finalize the tuned CatBoost model. Despite its name, `final_blend` holds a
# single model: the stacking/blending calls above are commented out.
final_blend = finalize_model(tuned_catboost)

In [None]:
# PyCaret 'error' plot for the finalized model.
plot_model(final_blend, plot='error')

In [None]:
# Confusion matrix for the finalized model.
plot_model(final_blend, plot = 'confusion_matrix')

# Submission

In [None]:
import gc
# Run a garbage-collection pass before predicting.
gc.collect()
# Predict on the full engineered test frame (`test`, not `test_dropna`).
unseen_predictions_blend = predict_model(final_blend, data=test)
unseen_predictions_blend.head()

In [None]:
# Every test row must have received a prediction. This is an explicit check
# rather than an `assert`, so it also runs when assertions are disabled.
if len(test) != len(unseen_predictions_blend):
    raise ValueError(
        f"expected {len(test)} predictions, got {len(unseen_predictions_blend)}"
    )

# Pair each prediction with the id of the row it was computed for (positional
# pairing, as before) instead of relying on the sample-submission file being
# in the same order as the test data.
sub = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Transported': unseen_predictions_blend['Label'].to_numpy(),
})
sub.to_csv('submission_blend.csv', index = False)

In [None]:
def plot_preds_dist(df, preds, target, ax=None, title=''):
    """Plot label counts of the training target next to the test predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the `target` column.
    preds : iterable
        Predicted labels for the test set.
    target : str
        Name of the target column in `df`.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 16x3 figure is created when omitted.
    title : str, default ''
        Title set on the axes.
    """
    train_labels = list(df[target])
    test_labels = list(preds)

    # Build the origin column from the two lengths. The previous inclusive
    # `.loc[0:len(df)]` slice tagged one row too many as 'Training', so the
    # first test prediction was counted with the training data.
    train_test_preds = pd.DataFrame({
        'label': train_labels + test_labels,
        'train_test': ['Training'] * len(train_labels) + ['Test preds'] * len(test_labels),
    })

    if ax is None:
        _, ax = plt.subplots(figsize=(16, 3))
    sns.countplot(data=train_test_preds, x='label', hue='train_test', ax=ax)
    ax.set_title(title)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the training target distribution with the predicted test labels.
# NOTE(review): the title says "Stack", but the predictions come from the
# finalized tuned CatBoost model - confirm the intended title.
plot_preds_dist(train, unseen_predictions_blend.Label, "Transported", title="Stack")