In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from joblib import dump, load

In [3]:
# Load the Kaggle test split; this is the frame every later cell transforms and predicts on.
kaggle = pd.read_csv('../Data/test.csv')
kaggle.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [4]:
kaggle.shape

(4277, 13)

In [5]:
kaggle.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

#### Leverage iterative imputer to impute missing numerical values of the kaggle dataset

In [6]:
# Hand joblib the path so it opens *and closes* the file itself; wrapping it in open()
# left the handle dangling. NOTE: loading unpickles arbitrary objects, so only load
# artifacts produced by a trusted training run.
imputer = load('../Data/iterative_imputer_mean.pkl')

In [7]:
# Object-dtype columns get a sentinel instead of numeric imputation. CryoSleep and VIP
# are among them (their null counts drop to 0 in the next cell).
object_cols = kaggle.select_dtypes(include='object').columns

In [8]:
# Put the '*MISSING*' sentinel into every object-dtype column in a single assignment.
kaggle[object_cols] = kaggle[object_cols].fillna('*MISSING*')

kaggle.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age              91
VIP               0
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name              0
dtype: int64

In [9]:
# Index labels of the rows whose Name holds the '*MISSING*' sentinel.
missing_names = kaggle.index[kaggle['Name'].eq('*MISSING*').to_numpy()]

In [10]:
# Every remaining index label, in original order, as a plain list.
valid_names = kaggle.index[~kaggle.index.isin(missing_names)].tolist()

In [11]:
# Derive FirstName / LastName from 'Name' for all rows at once instead of writing
# one cell at a time through .loc (slow on thousands of rows).
# Rows carrying the '*MISSING*' sentinel keep the sentinel in both columns; a name
# with fewer than two whitespace-separated tokens also falls back to the sentinel
# rather than raising IndexError. As before, only the first two tokens are used.
name_is_missing = kaggle['Name'].eq('*MISSING*')
name_tokens = kaggle['Name'].str.split()
kaggle['FirstName'] = name_tokens.str[0].mask(name_is_missing, '*MISSING*').fillna('*MISSING*')
kaggle['LastName'] = name_tokens.str[1].mask(name_is_missing, '*MISSING*').fillna('*MISSING*')

kaggle.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,FirstName,LastName
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,Nelly,Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,Lerome,Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,Sabih,Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,Meratz,Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,Brence,Harperez


In [12]:
# Index labels of the rows whose Cabin holds the '*MISSING*' sentinel.
missing_cabins = kaggle.index[kaggle['Cabin'].eq('*MISSING*').to_numpy()]

In [13]:
# Every remaining index label, in original order, as a plain list.
valid_cabins = kaggle.index[~kaggle.index.isin(missing_cabins)].tolist()

In [14]:
# Break 'Cabin' (formatted like 'G/3/S') into its three '/'-separated parts for all
# rows at once instead of writing one cell at a time through .loc.
# Rows carrying the '*MISSING*' sentinel get the sentinel in all three columns; a
# cabin string with fewer than three parts also falls back to the sentinel for the
# absent parts rather than raising IndexError.
cabin_is_missing = kaggle['Cabin'].eq('*MISSING*')
cabin_tokens = kaggle['Cabin'].str.split('/')
for position, part_name in enumerate(['CabinSector', 'CabinRoom', 'CabinClass']):
    kaggle[part_name] = (
        cabin_tokens.str[position]
        .mask(cabin_is_missing, '*MISSING*')
        .fillna('*MISSING*')
    )

kaggle.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,FirstName,LastName,CabinSector,CabinRoom,CabinClass
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,Nelly,Carsoning,G,3,S
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,Lerome,Peckers,F,4,S
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,Sabih,Unhearfus,C,0,S
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,Meratz,Caltilter,C,1,S
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,Brence,Harperez,F,5,S


In [15]:
# Categorical columns to one-hot encode before imputation.
to_dummy = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinSector', 'CabinClass']
# Derived text columns that are removed from the imputation frame.
to_drop = ['FirstName', 'LastName', 'CabinRoom']

In [16]:
# Build the imputation input: `drop` returns a new frame, so `kaggle` itself is untouched.
non_feature_columns = ['PassengerId', 'Cabin', 'Name', *to_drop]
kaggle_intermed = kaggle.drop(columns=non_feature_columns)
kaggle_intermed.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinSector,CabinClass
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S


In [17]:
kaggle_intermed['VIP'].value_counts()

False        4110
*MISSING*      93
True           74
Name: VIP, dtype: int64

In [18]:
# One-hot encode the categorical columns; the '*MISSING*' sentinel becomes its own indicator.
# NOTE(review): this rebinds kaggle_intermed, so re-running the cell without re-running
# the cell that builds kaggle_intermed should fail (the to_dummy columns are gone).
kaggle_intermed = pd.get_dummies(kaggle_intermed, columns=to_dummy, drop_first=False)
kaggle_intermed.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_*MISSING*,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S
0,27.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
1,19.0,0.0,9.0,0.0,2823.0,0.0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,31.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,38.0,0.0,6652.0,0.0,181.0,585.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,20.0,10.0,0.0,635.0,0.0,0.0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1


In [19]:
kaggle_intermed['VIP_*MISSING*'].value_counts()

0    4184
1      93
Name: VIP_*MISSING*, dtype: int64

In [20]:
kaggle_intermed['VIP_False'].value_counts()

1    4110
0     167
Name: VIP_False, dtype: int64

In [21]:
kaggle_intermed['VIP_True'].value_counts()

0    4203
1      74
Name: VIP_True, dtype: int64

In [22]:
# Apply the already-fitted imputer (transform only, nothing is re-fitted here).
# NOTE(review): confirm kaggle_intermed has the same columns in the same order as the
# frame the imputer was fitted on — get_dummies derives the dummy order from this data,
# and the CryoSleep_*/VIP_* order here differs from the order in model.feature_names.
imputed_kaggle = imputer.transform(kaggle_intermed)
imputed_kaggle.shape

(4277, 38)

In [23]:
# Label slices are inclusive on both ends: this selects the numeric columns Age..VRDeck,
# i.e. the only columns that still contain NaN.
null_columns = kaggle_intermed.loc[:, 'Age':'VRDeck'].columns
len(null_columns)

6

In [24]:
# One '<column>_imputed' label per numeric column that contained nulls.
flag_columns = [f'{column_name}_imputed' for column_name in null_columns]
flag_columns

['Age_imputed',
 'RoomService_imputed',
 'FoodCourt_imputed',
 'ShoppingMall_imputed',
 'Spa_imputed',
 'VRDeck_imputed']

In [25]:
kaggle_intermed.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_*MISSING*', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'CryoSleep_False', 'CryoSleep_True',
       'CryoSleep_*MISSING*', 'Destination_*MISSING*',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'VIP_False', 'VIP_True', 'VIP_*MISSING*',
       'CabinSector_*MISSING*', 'CabinSector_A', 'CabinSector_B',
       'CabinSector_C', 'CabinSector_D', 'CabinSector_E', 'CabinSector_F',
       'CabinSector_G', 'CabinSector_T', 'CabinClass_*MISSING*',
       'CabinClass_P', 'CabinClass_S'],
      dtype='object')

In [26]:
len(list(kaggle_intermed.columns))

32

In [27]:
# Wrap the imputer's array output in a labelled frame.
# The index of kaggle_intermed is reused so rows stay aligned with `kaggle` when the
# imputed columns are later copied back by label, even if the index is not 0..n-1.
# The trailing flag_columns label the extra columns the imputer appended
# (presumably its missing-value indicators — confirm against the fitted imputer).
imputed_kaggle = pd.DataFrame(
    imputed_kaggle,
    index=kaggle_intermed.index,
    columns=list(kaggle_intermed.columns) + flag_columns,
)
imputed_kaggle.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_*MISSING*,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S,Age_imputed,RoomService_imputed,FoodCourt_imputed,ShoppingMall_imputed,Spa_imputed,VRDeck_imputed
0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19.0,0.0,9.0,0.0,2823.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38.0,0.0,6652.0,0.0,181.0,585.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20.0,10.0,0.0,635.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
imputed_kaggle['VIP_False'].value_counts()

1.0    4110
0.0     167
Name: VIP_False, dtype: int64

In [29]:
# Overwrite the numeric columns of `kaggle` with their imputed counterparts (aligned on index).
imputed_labels = list(null_columns)
kaggle[imputed_labels] = imputed_kaggle[imputed_labels]

#### Numerical values for the kaggle dataset have been updated with the imputed values

In [30]:
kaggle.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
FirstName       0
LastName        0
CabinSector     0
CabinRoom       0
CabinClass      0
dtype: int64

In [37]:
# TotalSpend = row-wise sum of the five amenity spend columns.
# The columns are named explicitly (rather than sliced by label) and the insert
# position is derived from VRDeck (rather than the literal 12), so the cell keeps
# working if columns are added or reordered upstream.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spend = kaggle[spend_columns].sum(axis=1)
if 'TotalSpend' in kaggle.columns:
    # Re-running the cell: remove the stale column so insert() does not raise.
    del kaggle['TotalSpend']
kaggle.insert(kaggle.columns.get_loc('VRDeck') + 1, 'TotalSpend', total_spend)
kaggle.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,Name,FirstName,LastName,CabinSector,CabinRoom,CabinClass
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,Nelly,Carsoning,G,3,S
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,2832.0,Lerome Peckers,Lerome,Peckers,F,4,S
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,Sabih,Unhearfus,C,0,S
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,7418.0,Meratz Caltilter,Meratz,Caltilter,C,1,S
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,645.0,Brence Harperez,Brence,Harperez,F,5,S


In [38]:
# Same six categorical columns as `to_dummy`, re-declared for the final feature frame.
columns_to_dummy = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinSector', 'CabinClass']

In [39]:
# One-hot encode the full frame. Identifier/text columns (PassengerId, Cabin, Name, ...)
# are still present here; the model inputs are picked out via to_scale / categoricals.
kaggle_dummified = pd.get_dummies(kaggle, columns=columns_to_dummy, drop_first=False)
kaggle_dummified.head()

Unnamed: 0,PassengerId,Cabin,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,Name,...,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S
0,0013_01,G/3/S,27.0,0.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,...,0,0,0,0,0,1,0,0,0,1
1,0018_01,F/4/S,19.0,0.0,9.0,0.0,2823.0,0.0,2832.0,Lerome Peckers,...,0,0,0,0,1,0,0,0,0,1
2,0019_01,C/0/S,31.0,0.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,...,0,1,0,0,0,0,0,0,0,1
3,0021_01,C/1/S,38.0,0.0,6652.0,0.0,181.0,585.0,7418.0,Meratz Caltilter,...,0,1,0,0,0,0,0,0,0,1
4,0023_01,F/5/S,20.0,10.0,0.0,635.0,0.0,0.0,645.0,Brence Harperez,...,0,0,0,0,1,0,0,0,0,1


In [40]:
def get_scaled_columns(df):
    """Return the columns named in the global ``to_scale`` as a DataFrame."""
    numeric_part = df.loc[:, to_scale]
    return pd.DataFrame(numeric_part)

def get_non_scaled_columns(df):
    """Return the columns named in the global ``categoricals``, values untouched."""
    passthrough_part = df.loc[:, categoricals]
    return passthrough_part

In [41]:
kaggle_dummified.columns[:15]

Index(['PassengerId', 'Cabin', 'Age', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend', 'Name', 'FirstName',
       'LastName', 'CabinRoom', 'HomePlanet_*MISSING*', 'HomePlanet_Earth'],
      dtype='object')

In [44]:
# Numeric columns to standardise: the inclusive label range Age..TotalSpend.
to_scale = kaggle_dummified.loc[:, 'Age':'TotalSpend'].columns.tolist()
to_scale

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalSpend']

In [45]:
kaggle_dummified.columns

Index(['PassengerId', 'Cabin', 'Age', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend', 'Name', 'FirstName',
       'LastName', 'CabinRoom', 'HomePlanet_*MISSING*', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'CryoSleep_False',
       'CryoSleep_True', 'CryoSleep_*MISSING*', 'Destination_*MISSING*',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'VIP_False', 'VIP_True', 'VIP_*MISSING*',
       'CabinSector_*MISSING*', 'CabinSector_A', 'CabinSector_B',
       'CabinSector_C', 'CabinSector_D', 'CabinSector_E', 'CabinSector_F',
       'CabinSector_G', 'CabinSector_T', 'CabinClass_*MISSING*',
       'CabinClass_P', 'CabinClass_S'],
      dtype='object')

In [47]:
# Dummy columns passed through unscaled: everything from 'HomePlanet_*MISSING*' onward.
first_dummy_position = kaggle_dummified.columns.get_loc('HomePlanet_*MISSING*')
categoricals = kaggle_dummified.columns[first_dummy_position:].tolist()
categoricals

['HomePlanet_*MISSING*',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_False',
 'CryoSleep_True',
 'CryoSleep_*MISSING*',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_False',
 'VIP_True',
 'VIP_*MISSING*',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S']

In [50]:
# Wrap the two column selectors so they can be used as pipeline / FeatureUnion steps.
get_scaled_transformer = FunctionTransformer(get_scaled_columns)
get_non_scaled_transformer = FunctionTransformer(get_non_scaled_columns)

In [54]:
# Branch 1: select the numeric columns, then standardise them.
pipe_scaler =  Pipeline([
    ('scale_transform', get_scaled_transformer),
    ('ss', StandardScaler())
])

# Concatenate the scaled numeric block with the untouched dummy columns (in that order).
union = FeatureUnion([
    ('vect_pipe', pipe_scaler),
    ('other_cols', get_non_scaled_transformer)
])

# NOTE(review): fit_transform fits the StandardScaler on the Kaggle test rows themselves.
# If the models were trained on features scaled with training-set statistics, the
# training-time scaler should be loaded and only .transform() applied here — confirm.
union_res = union.fit_transform(kaggle_dummified)

In [55]:
# Re-attach labels: the union emits the scaled numeric block first, then the dummies.
scaled_column_names = [*to_scale, *categoricals]
kaggle_scaled = pd.DataFrame(union_res, columns=scaled_column_names)
kaggle_scaled

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,HomePlanet_*MISSING*,HomePlanet_Earth,HomePlanet_Europa,...,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S
0,-0.118360,-0.364123,-0.289172,-0.318360,-0.273804,-0.250659,-0.510517,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-0.688177,-0.364123,-0.283222,-0.318360,2.280577,-0.250659,0.489512,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.166548,-0.364123,-0.289172,-0.318360,-0.273804,-0.250659,-0.510517,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.665138,-0.364123,4.108468,-0.318360,-0.110027,0.222464,2.108908,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.616950,-0.347509,-0.289172,0.826094,-0.273804,-0.250659,-0.282756,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.380230,-0.364123,-0.289172,-0.318360,-0.273804,-0.250659,-0.510517,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4273,0.950047,-0.364123,0.270780,-0.287721,-0.264756,-0.134198,-0.151043,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4274,-0.019206,-0.364123,-0.289172,-0.318360,-0.273804,-0.250659,-0.510517,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4275,0.526503,-0.364123,1.482577,-0.318360,-0.273804,0.172321,0.620518,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [56]:
# Pass the path straight to joblib so the file is closed after loading (open() leaked it).
# NOTE: loading unpickles arbitrary objects — only load trusted artifacts.
model = load('../Output/logreg_cv_ii.joblib')

In [57]:
model.feature_names

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalSpend',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_*MISSING*',
 'CryoSleep_False',
 'CryoSleep_True',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_*MISSING*',
 'VIP_False',
 'VIP_True',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S']

In [58]:
# Column names and order the model expects.
# NOTE(review): `feature_names` looks like a custom attribute attached at training time
# (scikit-learn's own attribute is `feature_names_in_`) — confirm.
training_columns = model.feature_names
training_columns

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalSpend',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_*MISSING*',
 'CryoSleep_False',
 'CryoSleep_True',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_*MISSING*',
 'VIP_False',
 'VIP_True',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S']

In [59]:
test_columns = kaggle_scaled.columns.tolist()
test_columns

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalSpend',
 'HomePlanet_*MISSING*',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_False',
 'CryoSleep_True',
 'CryoSleep_*MISSING*',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_False',
 'VIP_True',
 'VIP_*MISSING*',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S']

In [60]:
def update_cols(df, target_cols, fill_value=np.nan):
    """Return a copy of ``df`` restricted to, and ordered as, ``target_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be aligned.
    target_cols : list-like
        Column labels wanted in the result, in the desired order.
    fill_value : scalar, default NaN
        Value used for every label in ``target_cols`` that ``df`` lacks.
        Pass ``0`` when the absent labels are one-hot indicator columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the row index of ``df`` and exactly ``target_cols`` as
        columns; columns of ``df`` not listed in ``target_cols`` are dropped.
    """
    # reindex preserves df's rows even when none of the targets exist (the
    # column-by-column build returned an empty frame in that case) and avoids
    # the `np.NaN` alias, which NumPy 2.0 removed.
    return df.reindex(columns=list(target_cols), fill_value=fill_value)

In [61]:
# Align to the model's columns and order; this drops 'HomePlanet_*MISSING*', which is
# present in the test frame but not in training_columns.
kaggle_matched = update_cols(kaggle_scaled, training_columns)

In [62]:
kaggle_matched.isnull().sum()

Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
TotalSpend                   0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
CryoSleep_*MISSING*          0
CryoSleep_False              0
CryoSleep_True               0
Destination_*MISSING*        0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
VIP_*MISSING*                0
VIP_False                    0
VIP_True                     0
CabinSector_*MISSING*        0
CabinSector_A                0
CabinSector_B                0
CabinSector_C                0
CabinSector_D                0
CabinSector_E                0
CabinSector_F                0
CabinSector_G                0
CabinSector_T                0
CabinClass_*MISSING*         0
CabinClass_P                 0
CabinClass_S                 0
dtype: int64

In [138]:
kaggle_matched.isnull().sum()

Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
TotalSpend                   0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
CryoSleep_*MISSING*          0
CryoSleep_False              0
CryoSleep_True               0
Destination_*MISSING*        0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
VIP_*MISSING*                0
VIP_False                    0
VIP_True                     0
CabinSector_*MISSING*        0
CabinSector_A                0
CabinSector_B                0
CabinSector_C                0
CabinSector_D                0
CabinSector_E                0
CabinSector_F                0
CabinSector_G                0
CabinSector_T                0
CabinClass_*MISSING*         0
CabinClass_P                 0
CabinClass_S                 0
dtype: int64

In [63]:
res = model.predict(kaggle_matched)

In [64]:
# Using the PassengerId Series as index also names the index 'PassengerId', which is the
# header the submission CSV is read back with.
res_df = pd.DataFrame(res, index=kaggle['PassengerId'], columns=['Transported'])

In [65]:
res_df['Transported'] = res_df['Transported'].astype(bool)

In [66]:
res_df.to_csv('../Output/submission_1.csv')

In [67]:
pd.read_csv('../Output/submission_1.csv', index_col='PassengerId')

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


Kaggle Score: .79050

#### Random Forest Predictions

In [68]:
# Pass the path straight to joblib so the file is closed after loading (open() leaked it).
# NOTE: loading unpickles arbitrary objects — only load trusted artifacts.
model_rf_gs = load('../Output/rf_gs_ii.joblib')

In [69]:
res_rf_gs = model_rf_gs.predict(kaggle_matched)

In [70]:
res_rf_gs_df = pd.DataFrame(res_rf_gs, index=kaggle['PassengerId'], columns=['Transported'])

In [71]:
res_rf_gs_df['Transported'] = res_rf_gs_df['Transported'].astype(bool)

In [72]:
res_rf_gs_df.to_csv('../Output/submission_rf_gs.csv')

In [73]:
pd.read_csv('../Output/submission_rf_gs.csv', index_col='PassengerId')

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


Kaggle Score: .79565

#### AdaBoost Predictions

In [74]:
lable = '../Output/submission_rf_gs.csv'.split('/')[-1]

In [75]:
lable.rsplit('.', maxsplit=1)[0]

'submission_rf_gs'

In [76]:
def run_new_model(model_path, features=None, passenger_ids=None, output_dir='../Output'):
    """Load a saved model, predict, write a submission CSV and return it read back.

    Parameters
    ----------
    model_path : str
        '/'-separated path to a joblib-serialised model exposing ``predict``.
        The file's base name (without extension) becomes part of the output name.
    features : pandas.DataFrame, optional
        Model input. Defaults to the notebook-level ``kaggle_matched``.
    passenger_ids : list-like, optional
        One id per row of ``features``. Defaults to ``kaggle['PassengerId']``.
    output_dir : str, default '../Output'
        Directory that receives ``submission_<model name>.csv``.

    Returns
    -------
    pandas.DataFrame
        The written CSV read back, indexed by 'PassengerId', with a boolean
        'Transported' column.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if features is None:
        features = kaggle_matched
    if passenger_ids is None:
        passenger_ids = kaggle['PassengerId']

    submission_name = model_path.split('/')[-1].rsplit('.', maxsplit=1)[0]

    # Give joblib the path so the file is closed after loading (open() leaked the handle).
    # NOTE: loading unpickles arbitrary objects — only load trusted artifacts.
    model = load(model_path)

    predictions = model.predict(features)
    # Name the index explicitly: it becomes the CSV header used when reading back.
    submission_index = pd.Index(passenger_ids, name='PassengerId')
    submission = pd.DataFrame(predictions, index=submission_index, columns=['Transported'])
    submission['Transported'] = submission['Transported'].astype(bool)

    submission_path = f'{output_dir}/submission_{submission_name}.csv'
    submission.to_csv(submission_path)
    return pd.read_csv(submission_path, index_col='PassengerId')

In [77]:
gs_ab_res = run_new_model('../Output/gs_ab_ii.joblib')
gs_ab_res

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


Kaggle Score: .78957

#### Gradient Boost Predictions

In [78]:
gb_gs_res = run_new_model('../Output/gb_gs_ii.joblib')

In [79]:
gb_gs_res

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


Kaggle Score: .78957