In [200]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import math

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

random_state = 0

In [201]:
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.datasets import make_circles, make_classification, make_moons

cat = CatBoostClassifier(verbose=False, random_state=random_state)
lgbm = LGBMClassifier(random_state=random_state)
xgb = XGBClassifier(random_state=random_state)
xgbrf = XGBRFClassifier(random_state=random_state)

models = []

models.append(('CAT', cat))
models.append(('LGBM', lgbm))
models.append(('XGB', xgb))
models.append(('XGBRF', xgbrf))


In [202]:
og_data = train_data = pd.read_csv('../dset/spaceship-titanic/train.csv')
test_data = pd.read_csv('../dset/spaceship-titanic/test.csv')
test_id = test_data['PassengerId']
# train_data = og_data.sample(frac=0.75)
# eval_data = og_data.drop(train_data.index, axis=0)
# print(train_data.shape)
# print(eval_data.shape)
# print(test_data.shape)

In [203]:
#train_data.info()
unique_values_csv = pd.Series(train_data['Transported'].unique())
#unique_values_csv.to_csv('./passenger_Id.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [204]:
def transform_bools(dfs, tr_arr, tst_arr):
    for i in tr_arr:
        dfs[0][i] = dfs[0][i].replace({False: 0, True: 1})
        
    for i in tst_arr:
        dfs[1][i] = dfs[1][i].replace({False: 0, True: 1})

def get_num_cols(df):
    num_col = [col for col in train_data if train_data[col].dtype in ['int64', 'float64']]
    return num_col;

def split_df(dfs, to_split, from_split, split_factor, to_drop):
    for df in dfs:
        df[to_split] = df[from_split].str.split(split_factor, expand=True)
        df.drop([from_split, to_drop], inplace=True, axis=1)


In [205]:
transform_bools([train_data,test_data], ['Transported', 'VIP', 'CryoSleep'], ['VIP', 'CryoSleep'])

In [206]:
num_col = get_num_cols(train_data) 
obj_col = [col for col in train_data if col not in num_col]

In [207]:
def corr_mat(data, target):
    corrmat_def = data.corr()
    top_corr = corrmat_def.nlargest(15, target)[target]
    return top_corr.sort_values(ascending=False)

In [208]:
corrmat = corr_mat(train_data[num_col], 'Transported')
print(corrmat)

Transported     1.000000
CryoSleep       0.468645
FoodCourt       0.046566
ShoppingMall    0.010141
VIP            -0.037650
Age            -0.075026
VRDeck         -0.207075
Spa            -0.221131
RoomService    -0.244611
Name: Transported, dtype: float64


In [209]:
split_df([train_data, test_data], ['Deck', 'Cabin_No', 'Side'], 'Cabin', '/', 'Cabin_No')
split_df([train_data, test_data], ['P_ID', 'Group_ID'], 'PassengerId', '_', 'P_ID')

print(train_data.columns, test_data.columns)

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported',
       'Deck', 'Side', 'Group_ID'],
      dtype='object') Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Deck', 'Side',
       'Group_ID'],
      dtype='object')


In [210]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# y = train_data['Transported']
# train_data.drop('Transported', axis=1, inplace=True)
# train_data_0 = train_data
# train_data['Cabin_No'] = train_data['Cabin_No'].astype(float)
# num_col = get_num_cols(train_data)
# 
# knn = KNNImputer(n_neighbors=3)
# imputed_train = train_data[num_col]
# imputed_train = knn.fit_transform(imputed_train)
# imputed_train = pd.DataFrame(imputed_train)
# imputed_train.columns = train_data[num_col].columns
# train_data[num_col] = imputed_train[num_col]

train_data.to_csv('./splitted_df.csv')

In [211]:
def drop_col(df, to_drop, axis = 1, inplace = True):
    df.drop(to_drop, axis=axis, inplace=inplace)

drop_col(train_data, 'Name')
drop_col(test_data, 'Name')

In [212]:
train_data['Group_ID'].astype(int)
test_data['Group_ID'].astype(int)

obj_col = [col for col in train_data.columns if train_data[col].dtype in ['object']]
print(obj_col)

['HomePlanet', 'Destination', 'Deck', 'Side', 'Group_ID']


In [213]:
#dropping room service etc
train_data_1 = train_data.copy()
#drop_col(train_data, ['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group_ID'])
#drop_col(test_data, ['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group_ID'])
#sum of sepndings


In [214]:
def round_ages(dfs, base):
    def rnd(x):
        return base * round(x/base)
    
    for df in dfs:
        df['Age'] = rnd(df['Age'])
        df['CryoSleep'] = np.round(df['CryoSleep'])
    
#round_ages([train_data,test_data], 5)
#train_data.to_csv('./splitted_df.csv')

In [215]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import cross_val_score

itm = IterativeImputer(random_state=0)

In [216]:
y = train_data['Transported']
X = train_data.drop('Transported', axis=1)
X = one_hot_tr = pd.get_dummies(X, drop_first=True)
X = itm.fit_transform(X)

In [217]:
X_test = pd.get_dummies(test_data, drop_first=True)
X_test = itm.fit_transform(X_test)

In [218]:
round_ages([train_data,test_data], 5)

In [219]:
# for name, model in models:
#     scores = cross_val_score(model, X, train_data['Transported'], scoring='accuracy')
#     print(f'{name}\t:\t{np.mean(scores).round(4)}')

In [220]:
cat.fit(X, y)
pred = cat.predict(X_test)
submission = pd.DataFrame({'PassengerId':test_id, 'Transported':pred.astype(bool)})

submission.to_csv('./submission.csv', index=False)

In [221]:
imp_ft = pd.Series(cat.feature_importances_, index=one_hot_tr.columns)

In [222]:
imp_ft.sort_values(ascending=False).to_csv('./imp_feature.csv')