In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
#PP

# Import

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Data understanding and cleaning

In [None]:
# Load the training split of the Spaceship Titanic data into `df`
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

# Preview the first rows
df.head()

In [None]:
# Number of rows in the training frame.
# Read the row count directly instead of unpacking into `_`, which would
# overwrite IPython's "last output" variable.
num_of_obj = df.shape[0]

In [None]:
# Number of rows flagged as duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Descriptive statistics of the training frame
df.describe()

In [None]:
def analyse_na(data_frame):
    """Summarise the missing values of ``data_frame``, one row per affected column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns ``columns`` (column name), ``count_of_na`` (number of missing
        values), ``fraction_of_na`` (share of missing values, rounded to three
        decimals) and ``type`` (dtype of the column).  Only columns with at
        least one missing value are listed, sorted by ``count_of_na`` in
        descending order.  The result is empty when nothing is missing.
    """
    # The fraction is relative to the frame that was passed in, not to any
    # notebook-level row count, so the function is correct for every frame.
    total_rows = len(data_frame)

    na_counts = data_frame.isna().sum()
    na_counts = na_counts[na_counts > 0]
    affected = na_counts.index.tolist()

    summary = pd.DataFrame({
        'columns': affected,
        'count_of_na': na_counts.to_numpy(),
        'fraction_of_na': (na_counts / total_rows).round(3).to_numpy(),
        'type': [data_frame[column].dtype for column in affected]
    })

    return summary.sort_values('count_of_na', ascending=False).reset_index(drop=True)

In [None]:
# One 80-bin histogram per float64 column of the training frame
float_columns = df.select_dtypes(include=['float64']).columns
for column in float_columns:
    df.hist(column, bins=80)
    print()

In [None]:
# Mean of every numeric/boolean column per CryoSleep value.
# The value columns are listed explicitly so the default mean aggregation is
# never attempted on text columns such as names or cabin codes.
value_columns = (
    df.select_dtypes(include=['number', 'bool'])
    .columns.difference(['CryoSleep'])
    .tolist()
)
df.pivot_table(index='CryoSleep', values=value_columns, aggfunc='mean')

In [None]:
# Missing-value summary of the raw training frame (despite the variable name,
# this holds the whole summary table returned by analyse_na, not just names)
names_of_columns = analyse_na(df)
names_of_columns

In [None]:
def split_cabin(x):
    """Split a cabin value on '/' into its parts.

    The value is converted to ``str`` first.  When the split yields fewer than
    three parts, a list of three 'Missing' placeholders is returned instead;
    otherwise all parts are returned as a list.
    """
    parts = str(x).split('/')
    return parts if len(parts) >= 3 else ['Missing', 'Missing', "Missing"]

In [None]:
def clean(data):
    """Clean a raw passenger frame in place so it can be fed to the models.

    Steps, in order:

    * drop ``Name``;
    * fill missing amounts in the five spending columns with 0;
    * replace ``Cabin`` by ``Deck`` (first '/'-separated part) and ``Side``
      (third part); values with fewer than three parts become 'Missing';
    * fill missing ``VIP``, ``HomePlanet`` and ``Destination`` with 'Missing';
    * fill missing ``Age`` with the median, then replace ``Age`` by
      ``Category_age`` (age floor-divided by 10);
    * fill missing ``CryoSleep`` with True when the passenger spent nothing
      at all, and with False otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns listed above.  It is modified in place.

    Returns
    -------
    None
    """
    spend_columns = ['ShoppingMall', 'VRDeck', 'FoodCourt', 'Spa', 'RoomService']

    data.drop(columns='Name', inplace=True)

    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on a column selection: that chained form is not guaranteed to write
    # through to `data`.
    data[spend_columns] = data[spend_columns].fillna(0)

    # Cabin -> Deck / Side.  Anything that does not split into at least three
    # parts (including missing values) is treated as unknown.
    cabin_parts = data['Cabin'].map(lambda cabin: str(cabin).split('/'))
    well_formed = cabin_parts.map(len) >= 3
    data['Deck'] = cabin_parts.str[0].where(well_formed, 'Missing')
    data['Side'] = cabin_parts.str[2].where(well_formed, 'Missing')
    data.drop(columns='Cabin', inplace=True)

    for column in ('VIP', 'HomePlanet', 'Destination'):
        data[column] = data[column].fillna('Missing')

    age = data['Age'].fillna(data['Age'].median())
    data['Category_age'] = age // 10
    data.drop(columns='Age', inplace=True)

    # Known CryoSleep values are kept.  A missing value becomes True only for
    # passengers whose total spending is zero; every other missing value
    # becomes False.  The result is stored as a plain boolean column.
    cryo_sleep = data['CryoSleep']
    no_spending = data[spend_columns].sum(axis=1).eq(0)
    data['CryoSleep'] = cryo_sleep.eq(True) | (cryo_sleep.isna() & no_spending)

In [None]:
def combo_info(data):
    """Display a quick overview of ``data``.

    Shows the first five rows, the missing-value summary produced by
    ``analyse_na`` and the output of ``DataFrame.info``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.  Every section reports on this frame.
    """
    print('First 5 indexes:')
    display(data.head())
    print()
    print('analyse_na:')
    display(analyse_na(data))
    print()
    print('Info:')
    # info() prints its report itself and returns None, so it is not wrapped
    # in display().
    data.info()

In [None]:
# clean() modifies its argument in place, so work on a copy and keep the raw
# frame `df` untouched
cdf = df.copy()
clean(cdf)
combo_info(cdf)

# Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score

In [None]:
# Target column
y = cdf['Transported']
# Feature matrix: everything except the target and the passenger id,
# with categorical columns one-hot encoded
X = pd.get_dummies(cdf.drop(columns=['Transported', 'PassengerId']))

In [None]:
# Hold out 30% of the rows for evaluation (fixed seed for a repeatable split).
# x_test / y_test are read as globals by stats_of_test and plot_roc_auc.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print('x_train:', x_train.shape, 'y_train:', y_train.shape)

In [None]:
# from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.model_selection import KFold

In [None]:
def stats_of_test(model):
    """Score a fitted ``model`` on the notebook-level hold-out split.

    Predicts on the global ``x_test`` and compares against the global
    ``y_test``.

    Returns
    -------
    list
        ``[accuracy, precision, recall]``.
    """
    predicted = model.predict(x_test)
    metrics = (accuracy_score, precision_score, recall_score)
    return [metric(y_test, predicted) for metric in metrics]

In [None]:
def train_model(model, splitter, x_data, y_data):
    """Cross-validate ``model`` and return its mean scores over all folds.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``.  It is refitted on every
        fold, so it is left fitted on the last fold's training part.
    splitter
        Object whose ``split(x_data)`` yields ``(train_index, test_index)``
        pairs of positional row indices.
    x_data, y_data
        Features and labels; pandas objects or array-likes that support
        indexing with an index array.

    Returns
    -------
    list
        ``[mean accuracy, mean precision, mean recall]``.
    """
    def take_rows(data, positions):
        # The splitter yields row positions.  Plain `[]` on a DataFrame would
        # look the positions up as column labels, so pandas objects must go
        # through .iloc.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return data[positions]

    list_with_acc = []
    list_with_pre = []
    list_with_rec = []

    for train_index, test_index in splitter.split(x_data):
        # Fold-local names are distinct from the notebook's global
        # x_train / x_test split to avoid confusing the two.
        x_fold_train = take_rows(x_data, train_index)
        x_fold_test = take_rows(x_data, test_index)
        y_fold_train = take_rows(y_data, train_index)
        y_fold_test = take_rows(y_data, test_index)

        model.fit(x_fold_train, y_fold_train)
        predict = model.predict(x_fold_test)

        list_with_acc.append(accuracy_score(y_fold_test, predict))
        list_with_pre.append(precision_score(y_fold_test, predict))
        list_with_rec.append(recall_score(y_fold_test, predict))

    return [
        sum(list_with_acc) / len(list_with_acc),
        sum(list_with_pre) / len(list_with_pre),
        sum(list_with_rec) / len(list_with_rec),
    ]

In [None]:
def plot_roc_auc(model, label=None):
    """Plot the ROC curve of a fitted ``model`` on the notebook-level hold-out split.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``; the second probability
        column is used as the score.
    label : str, optional
        Name shown in the legend.  Defaults to the model's class name so
        each plot is labelled with the model it actually shows.
    """
    if label is None:
        label = type(model).__name__

    y_pred = model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc = round(roc_auc_score(y_test, y_pred), 4)

    plt.figure(figsize=(15, 10))
    plt.plot(fpr, tpr, label=label + ", AUC=" + str(auc))
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend()
    plt.show()

In [None]:
# Gradient boosting with default settings; the target is cast to int8 for fitting
gbc_model = GradientBoostingClassifier().fit(x_train, y_train.astype('int8'))

# [accuracy, precision, recall] on the hold-out split
stats_of_test(gbc_model)

In [None]:
# ROC curve of the gradient boosting model on the hold-out split
plot_roc_auc(gbc_model)

# Trying multiple models

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost with log-loss objective and training output silenced (verbose=0)
cat_model = CatBoostClassifier(loss_function='Logloss',
    verbose=0).fit(x_train, y_train.astype('int8'))

# [accuracy, precision, recall] on the hold-out split
stats_of_test(cat_model)

In [None]:
# Feature importances of the fitted CatBoost model
cat_model.get_feature_importance(prettified=True)

In [None]:
# ROC curve of the CatBoost model on the hold-out split
plot_roc_auc(cat_model)

In [None]:
import xgboost as xgb

In [None]:
# XGBoost with 10 trees.
# NOTE(review): `xgb_cl` is a second name for the same fitted model and is not
# used anywhere in the visible cells. Unlike the other models, the target is
# passed here without the int8 cast.
xgb_model = xgb_cl = xgb.XGBClassifier(n_estimators=10).fit(x_train, y_train)

# [accuracy, precision, recall] on the hold-out split
stats_of_test(xgb_model)

In [None]:
# ROC curve of the XGBoost model on the hold-out split
plot_roc_auc(xgb_model)

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# LightGBM with a fixed seed; the target is cast to int8 for fitting
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(x_train, y_train.astype('int8'))

In [None]:
# [accuracy, precision, recall] of LightGBM on the hold-out split
stats_of_test(lgbm)

In [None]:
# ROC curve of the LightGBM model on the hold-out split
plot_roc_auc(lgbm)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest neighbours with default settings, fitted on the unscaled features
knn = KNeighborsClassifier()
knn.fit(x_train, y_train.astype('int8'))

# [accuracy, precision, recall] on the hold-out split
stats_of_test(knn)

In [None]:
# ROC curve of the k-nearest-neighbours model on the hold-out split
plot_roc_auc(knn)

In [None]:
from sklearn.ensemble import BaggingClassifier


# Ensemble of 50 CatBoost classifiers wrapped in a BaggingClassifier.
bagging_params = dict(n_estimators=50, random_state=12)
try:
    # Current keyword name for the wrapped model.
    baggingClf = BaggingClassifier(estimator=CatBoostClassifier(verbose=0),
                                   **bagging_params)
except TypeError:
    # Older scikit-learn releases only accept the previous keyword name.
    baggingClf = BaggingClassifier(base_estimator=CatBoostClassifier(verbose=0),
                                   **bagging_params)

# to_numpy() hands scikit-learn a plain 1-D array without relying on the
# deprecated Series.ravel().
baggingClf.fit(x_train, y_train.astype('int8').to_numpy())

In [None]:
# [accuracy, precision, recall] of the bagged CatBoost ensemble on the hold-out split
stats_of_test(baggingClf)

In [None]:
# ROC curve of the bagged CatBoost ensemble on the hold-out split
plot_roc_auc(baggingClf)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


# Stack 0: logistic regression, CatBoost, k-NN and LightGBM as base learners,
# combined by an SVC meta-model (probability=True so predict_proba is available).
estimators = [('lr', LogisticRegression()),
              ('boosting', CatBoostClassifier(verbose=0)),
              ('knn', KNeighborsClassifier()),
              ('lgbm', LGBMClassifier())]
stackingClf = StackingClassifier(estimators=estimators,
                                 final_estimator=SVC(probability=True))

# to_numpy() hands scikit-learn a plain 1-D array without relying on the
# deprecated Series.ravel().
stackingClf.fit(x_train, y_train.astype('int8').to_numpy())

In [None]:
# [accuracy, precision, recall] of the first stacked model on the hold-out split
stats_of_test(stackingClf)

In [None]:
# ROC curve of the first stacked model on the hold-out split
plot_roc_auc(stackingClf)

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Stack 1: the first stack's base learners plus a random forest,
# combined by an SVC meta-model (probability=True so predict_proba is available).
estimators_1 = [('lr', LogisticRegression()),
                ('boosting', CatBoostClassifier(verbose=0)),
                ('knn', KNeighborsClassifier()),
                ('lgbm', LGBMClassifier()),
                ('forest', RandomForestClassifier())]
stackingClf_1 = StackingClassifier(estimators=estimators_1,
                                   final_estimator=SVC(probability=True))

# to_numpy() hands scikit-learn a plain 1-D array without relying on the
# deprecated Series.ravel().
stackingClf_1.fit(x_train, y_train.astype('int8').to_numpy())

In [None]:
# [accuracy, precision, recall] of stackingClf_1 on the hold-out split
stats_of_test(stackingClf_1)

In [None]:
# ROC curve of stackingClf_1 on the hold-out split
plot_roc_auc(stackingClf_1)

In [None]:
# Stack 2: logistic regression, gradient boosting, random forest, k-NN and
# CatBoost, combined by an SVC meta-model (probability=True so predict_proba
# is available).  This is the model that gets pickled further down.
estimators_2 = [('lr', LogisticRegression()),
                ('boosting', GradientBoostingClassifier()),
                ('forest', RandomForestClassifier()),
                ('knn', KNeighborsClassifier()),
                ('cat', CatBoostClassifier(verbose=0))]
stackingClf_2 = StackingClassifier(estimators=estimators_2,
                                   final_estimator=SVC(probability=True))

# to_numpy() hands scikit-learn a plain 1-D array without relying on the
# deprecated Series.ravel().
stackingClf_2.fit(x_train, y_train.astype('int8').to_numpy())

In [None]:
# [accuracy, precision, recall] of stackingClf_2 on the hold-out split
stats_of_test(stackingClf_2)

In [None]:
# ROC curve of stackingClf_2 on the hold-out split
plot_roc_auc(stackingClf_2)

In [None]:
# Stack 3: logistic regression, gradient boosting, random forest, k-NN and
# LightGBM, combined by an SVC meta-model (probability=True so predict_proba
# is available).
estimators_3 = [('lr', LogisticRegression()),
                ('boosting', GradientBoostingClassifier()),
                ('forest', RandomForestClassifier()),
                ('knn', KNeighborsClassifier()),
                ('lgbm', LGBMClassifier())]
stackingClf_3 = StackingClassifier(estimators=estimators_3,
                                   final_estimator=SVC(probability=True))

# to_numpy() hands scikit-learn a plain 1-D array without relying on the
# deprecated Series.ravel().
stackingClf_3.fit(x_train, y_train.astype('int8').to_numpy())

In [None]:
# [accuracy, precision, recall] of stackingClf_3 on the hold-out split
stats_of_test(stackingClf_3)

In [None]:
# ROC curve of stackingClf_3 on the hold-out split
plot_roc_auc(stackingClf_3)

In [None]:
import pickle

# Persist the chosen model to disk.
# NOTE(review): the file is called 'gradientboosted.pkl' but it stores the
# stacked model stackingClf_2, not a plain gradient boosting model.
with open('gradientboosted.pkl','wb') as f:
    pickle.dump(stackingClf_2, f)
    
# Load it straight back; `reloaded_model` is what produces the final predictions.
# Only unpickle files this notebook wrote itself: loading a pickle can execute code.
with open('gradientboosted.pkl','rb') as f:
    reloaded_model = pickle.load(f)

# Test dataframe

In [None]:
# Load the competition's test split and preview the first rows
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
test_df.head()

In [None]:
# Apply the same in-place cleaning as for the training data, on a copy of the raw test frame
test_cdf = test_df.copy()
clean(test_cdf)
combo_info(test_cdf)

In [None]:
# One-hot encode the test features, then align them with the training matrix X.
# A category that occurs in only one of the two files would otherwise give the
# model a different set (or order) of columns than it was fitted on.
# Dummy columns absent from the test data are added as all-zero columns;
# columns unknown to the model are dropped.
test_X = (
    pd.get_dummies(test_cdf.drop('PassengerId', axis=1))
    .reindex(columns=X.columns, fill_value=0)
)

In [None]:
# Predict the target for the test passengers with the reloaded (unpickled) model
end_predict = reloaded_model.predict(test_X)

In [None]:
# Turn the numeric predictions into booleans (0 -> False, anything else -> True)
pred = [not (label == 0) for label in end_predict]

# Two-column submission file: passenger id and predicted Transported flag
output = pd.DataFrame({
    'PassengerId': test_cdf['PassengerId'],
    'Transported': pred,
})
output.to_csv('submission.csv', index=False)