In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from joblib import dump, load

In [3]:
df = pd.read_csv('../Data/input_nulldrop.csv')
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,TotalSpend,FirstName,LastName,CabinSector,CabinRoom,CabinClass
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,0.0,Maham,Ofracculy,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,736.0,Juanna,Vines,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,10383.0,Altark,Susent,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,5176.0,Solam,Susent,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,1091.0,Willy,Santantines,F,1,S


In [4]:
columns_to_dummy = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'LastName', 'CabinSector', 'CabinClass']

In [5]:
df_dummified = pd.get_dummies(df, columns=columns_to_dummy, drop_first=True)

In [6]:
df_dummified.head()

Unnamed: 0,PassengerId,Cabin,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,...,LastName_Yorkland,LastName_Youngrayes,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_S
0,0001_01,B/0/P,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,...,0,0,1,0,0,0,0,0,0,0
1,0002_01,F/0/S,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,...,0,0,0,0,0,0,1,0,0,1
2,0003_01,A/0/S,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,...,0,0,0,0,0,0,0,0,0,1
3,0003_02,A/0/S,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,...,0,0,0,0,0,0,0,0,0,1
4,0004_01,F/1/S,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,...,0,0,0,0,0,0,1,0,0,1


In [7]:
df_dummified.isnull().sum().sum()

0

In [28]:
df_dummified.to_csv('../Data/input_nulldrop_dummified.csv', index=False)

### Simple Impute Dataset

In [9]:
df_si = pd.read_csv('../Data/input_simple_impute.csv')
df_si.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,ImputedValues,FirstName,LastName,CabinSector,CabinRoom,CabinClass
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,0,Maham,Ofracculy,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,0,Juanna,Vines,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,0,Altark,Susent,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,0,Solam,Susent,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,0,Willy,Santantines,F,1,S


In [10]:
columns_to_dummy = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'LastName', 'CabinSector', 'CabinClass']

The names are causing problems when it comes to dummification, because there are really limitless possibilities of what could be present, and if a new name appears in the test set, that translates into a column that did not exist in the training data. I suppose I could either identify those new names in the test set and remove them prior to being fed into the model (or treat them as '*MISSING*'), or perhaps there would be value in bounding the set of names by only taking the first few letters. Not entirely sure what the best path is, but may just leave names out entirely in the beginning here.

In [11]:
df_si_dummified = pd.get_dummies(df_si, columns=columns_to_dummy, drop_first=True)

In [12]:
df_si_dummified.head()

Unnamed: 0,PassengerId,Cabin,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,...,CabinSector_A,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_P,CabinClass_S
0,0001_01,B/0/P,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,...,0,1,0,0,0,0,0,0,1,0
1,0002_01,F/0/S,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,...,0,0,0,0,0,1,0,0,0,1
2,0003_01,A/0/S,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,...,1,0,0,0,0,0,0,0,0,1
3,0003_02,A/0/S,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,...,1,0,0,0,0,0,0,0,0,1
4,0004_01,F/1/S,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,...,0,0,0,0,0,1,0,0,0,1


In [13]:
df_si_dummified.isnull().sum().sum()

0

In [15]:
df_si_dummified.to_csv('../Data/input_si_dummified.csv', index=False)

### Iterative Imputer Dataset

In [16]:
df_ii = pd.read_csv('../Data/input_iterative_imputer.csv')
df_ii.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,FirstName,LastName,CabinSector,CabinRoom,CabinClass
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,Maham,Ofracculy,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,Juanna,Vines,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,Altark,Susent,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,Solam,Susent,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,Willy,Santantines,F,1,S


In [17]:
df_ii.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
FirstName       0
LastName        0
CabinSector     0
CabinRoom       0
CabinClass      0
dtype: int64

In [18]:
columns_to_dummy = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinSector', 'CabinClass']

In [19]:
df_ii_dummified = pd.get_dummies(df_ii, columns=columns_to_dummy, drop_first=False)

In [21]:
df_ii_dummified.head()

Unnamed: 0,PassengerId,Cabin,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,...,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S
0,0001_01,B/0/P,39.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,...,1,0,0,0,0,0,0,0,1,0
1,0002_01,F/0/S,24.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,...,0,0,0,0,1,0,0,0,0,1
2,0003_01,A/0/S,58.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,...,0,0,0,0,0,0,0,0,0,1
3,0003_02,A/0/S,33.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,...,0,0,0,0,0,0,0,0,0,1
4,0004_01,F/1/S,16.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,...,0,0,0,0,1,0,0,0,0,1


In [22]:
df_ii_dummified['TotalSpend'] = df_ii_dummified.loc[:, 'RoomService':'VRDeck'].sum(axis=1)

In [23]:
column_list = df_ii_dummified.columns.tolist()
column_list

['PassengerId',
 'Cabin',
 'Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'Transported',
 'FirstName',
 'LastName',
 'CabinRoom',
 'HomePlanet_*MISSING*',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_*MISSING*',
 'CryoSleep_False',
 'CryoSleep_True',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_*MISSING*',
 'VIP_False',
 'VIP_True',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S',
 'TotalSpend']

In [24]:
column_list.remove('TotalSpend')
column_list.insert(8, 'TotalSpend')

In [25]:
df_ii_dummified = df_ii_dummified[column_list]

In [26]:
df_ii_dummified.head()

Unnamed: 0,PassengerId,Cabin,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,Name,...,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S
0,0001_01,B/0/P,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,1,0,0,0,0,0,0,0,1,0
1,0002_01,F/0/S,24.0,109.0,9.0,25.0,549.0,44.0,736.0,Juanna Vines,...,0,0,0,0,1,0,0,0,0,1
2,0003_01,A/0/S,58.0,43.0,3576.0,0.0,6715.0,49.0,10383.0,Altark Susent,...,0,0,0,0,0,0,0,0,0,1
3,0003_02,A/0/S,33.0,0.0,1283.0,371.0,3329.0,193.0,5176.0,Solam Susent,...,0,0,0,0,0,0,0,0,0,1
4,0004_01,F/1/S,16.0,303.0,70.0,151.0,565.0,2.0,1091.0,Willy Santantines,...,0,0,0,0,1,0,0,0,0,1


In [27]:
df_ii_dummified.isnull().sum().sum()

0

In [28]:
df_ii_dummified.to_csv('../Data/input_ii_dummified.csv', index=False)

### Generate Custom Features

In [69]:
df_ii_customized = df_ii.copy()
df_ii_customized

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,FirstName,LastName,CabinSector,CabinRoom,CabinClass
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,Maham,Ofracculy,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,Juanna,Vines,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,Altark,Susent,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,Solam,Susent,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,Willy,Santantines,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,Gravior,Noxnuther,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,Kurta,Mondalley,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,Fayey,Connon,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,Celeon,Hontichre,E,608,S


In [70]:
df_ii_customized.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported', 'FirstName', 'LastName', 'CabinSector',
       'CabinRoom', 'CabinClass'],
      dtype='object')

In [71]:
df_ii_customized.drop(columns=['Cabin', 'Name', 'FirstName'], inplace=True)
df_ii_customized.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported', 'LastName', 'CabinSector', 'CabinRoom', 'CabinClass'],
      dtype='object')

#### Determine how many people are associated on PassengerId number (assumed family or close association)

In [72]:
to_encode = ['HomePlanet', 
             'CryoSleep', 
             'Destination', 
             'VIP', 
             'LastName',
             'CabinSector',
             'CabinRoom',
             'CabinClass'
            ]

In [73]:
id_associations = {}
for base_id in [id.split('_')[0] for id in df_ii_customized['PassengerId']]:
    if base_id not in id_associations:
        id_associations[base_id] = 1
    else:
        id_associations[base_id] += 1
id_associations

# create feature to attribute this value
def get_assoc_value(row):
    return id_associations[row.split('_')[0]]

df_ii_customized['Associates'] = df_ii_customized['PassengerId'].apply(get_assoc_value)

In [74]:
df_ii_customized

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,LastName,CabinSector,CabinRoom,CabinClass,Associates
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,Ofracculy,B,0,P,1
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1,Vines,F,0,S,1
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,Susent,A,0,S,2
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,Susent,A,0,S,2
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1,Santantines,F,1,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,0,Noxnuther,A,98,P,1
8689,9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,0,Mondalley,G,1499,S,1
8690,9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,1,Connon,G,1500,S,1
8691,9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,0,Hontichre,E,608,S,2


#### Add total spend feature

In [75]:
df_ii_customized.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported', 'LastName', 'CabinSector', 'CabinRoom', 'CabinClass',
       'Associates'],
      dtype='object')

In [76]:
df_ii_customized['TotalSpend'] = df_ii_customized.loc[:, 'RoomService':'VRDeck'].sum(axis=1)

In [77]:
df_ii_customized.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported', 'LastName', 'CabinSector', 'CabinRoom', 'CabinClass',
       'Associates', 'TotalSpend'],
      dtype='object')

### Encode Categoricals

In [78]:
def encode_categorical(df, column_name):
    labels = sorted(df[column_name].unique())
    decode_key = [(label, idx) for idx, label in enumerate(sorted(df[column_name].unique()))]
    n = 0
    for label in labels:
        df.loc[df[column_name] == label, column_name] = n
        n += 1
    return decode_key

In [79]:
def create_mixed_categorical(df, col_name_1, col_name_2):
    # Combines two encoded categorical features
    mixed_column = col_name_1 + '_X_' + col_name_2
    df[mixed_column] = col_name_1 + df[col_name_1].astype(str) + col_name_2 + df[col_name_2].astype(str)
    
    return encode_categorical(df, mixed_column)

In [80]:
def plot_hue_hist(df, target_name, hue_name, ax=None, kde=True):
    g = sns.histplot(x=target_name, data=df, hue=hue_name, discrete=True, ax=ax, kde=kde)
    
def plot_hue_kde(df, target_name, hue_name, ax=None, fill=True):
    g = sns.kdeplot(x=target_name, data=df, hue=hue_name, ax=ax)

In [81]:
sorted(df_ii_customized['HomePlanet'].unique())

['*MISSING*', 'Earth', 'Europa', 'Mars']

#### Create master dictionary that contains all unique labels and their associated encoding

In [82]:
test_dict

{1: 'Hi'}

In [83]:
test_dict2 = {2:'there',3:'big',4:'fella'}

In [84]:
test_dict3 = test_dict.copy()
test_dict3

{1: 'Hi'}

In [85]:
test_dict3.update(test_dict2)
test_dict3

{1: 'Hi', 2: 'there', 3: 'big', 4: 'fella'}

In [86]:
encoder_dict = {}

In [87]:
for feature in to_encode:
    key = encode_categorical(df_ii_customized, feature)
    encoder_dict.update(key)

In [88]:
encoder_dict

{'*MISSING*': 0,
 'Earth': 1,
 'Europa': 2,
 'Mars': 3,
 'False': 1,
 'True': 2,
 '55 Cancri e': 1,
 'PSO J318.5-22': 2,
 'TRAPPIST-1e': 3,
 'Acobson': 1,
 'Acobsond': 2,
 'Adavisons': 3,
 'Adkinson': 4,
 'Admingried': 5,
 'Ageurante': 6,
 'Aginge': 7,
 'Ailled': 8,
 'Aillyber': 9,
 'Aiming': 10,
 'Ainatint': 11,
 'Aindlylid': 12,
 'Ainserfle': 13,
 'Airdring': 14,
 'Aivering': 15,
 'Alaring': 16,
 'Alaxed': 17,
 'Alberts': 18,
 'Alcemblery': 19,
 'Alenat': 20,
 'Alenter': 21,
 'Alentonway': 22,
 'Alest': 23,
 'Alfordonard': 24,
 'Alindiveng': 25,
 'Alldson': 26,
 'Aloubtled': 27,
 'Alshipson': 28,
 'Alvasquez': 29,
 'Alvercal': 30,
 'Alvesssidy': 31,
 'Ambleetive': 32,
 'Ambleeve': 33,
 'Amblereld': 34,
 'Ametic': 35,
 'Amincrerus': 36,
 'Amonsmane': 37,
 'Amonysidle': 38,
 'Amoutake': 39,
 'Amsive': 40,
 'Amspring': 41,
 'Anake': 42,
 'Anche': 43,
 'Ancontaked': 44,
 'Ancy': 45,
 'Andackson': 46,
 'Anderking': 47,
 'Andley': 48,
 'Ane': 49,
 'Aneetle': 50,
 'Aneter': 51,
 'Anindery':

In [89]:
df_ii_customized.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,LastName,CabinSector,CabinRoom,CabinClass,Associates,TotalSpend
0,0001_01,2,1,3,39.0,1,0.0,0.0,0.0,0.0,0.0,0,1432,2,1,1,1,0.0
1,0002_01,1,1,3,24.0,1,109.0,9.0,25.0,549.0,44.0,1,2110,6,1,2,1,736.0
2,0003_01,2,1,3,58.0,2,43.0,3576.0,0.0,6715.0,49.0,0,1991,1,1,2,2,10383.0
3,0003_02,2,1,3,33.0,1,0.0,1283.0,371.0,3329.0,193.0,0,1991,1,1,2,2,5176.0
4,0004_01,1,1,3,16.0,1,303.0,70.0,151.0,565.0,2.0,1,1779,6,2,2,1,1091.0


#### Generate Mixed Features

In [90]:
to_mix = [['HomePlanet', 'Destination'],
          ['HomePlanet_X_Destination', 'CryoSleep'],
          ['CabinSector', 'CabinRoom'],
          ['CabinSector_X_CabinRoom', 'CabinClass']
         ]

In [91]:
for pair in to_mix:
    key = create_mixed_categorical(df_ii_customized, pair[0], pair[1])
    encoder_dict.update(key)

In [97]:
list(encoder_dict.keys())[-10:]

['CabinSector_X_CabinRoom995CabinClass2',
 'CabinSector_X_CabinRoom996CabinClass1',
 'CabinSector_X_CabinRoom997CabinClass1',
 'CabinSector_X_CabinRoom997CabinClass2',
 'CabinSector_X_CabinRoom998CabinClass2',
 'CabinSector_X_CabinRoom999CabinClass2',
 'CabinSector_X_CabinRoom99CabinClass1',
 'CabinSector_X_CabinRoom99CabinClass2',
 'CabinSector_X_CabinRoom9CabinClass1',
 'CabinSector_X_CabinRoom9CabinClass2']

In [98]:
df_ii_customized.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,LastName,CabinSector,CabinRoom,CabinClass,Associates,TotalSpend,HomePlanet_X_Destination,HomePlanet_X_Destination_X_CryoSleep,CabinSector_X_CabinRoom,CabinSector_X_CabinRoom_X_CabinClass
0,0001_01,2,1,3,39.0,1,0.0,0.0,0.0,0.0,...,1432,2,1,1,1,0.0,11,4,98,6543
1,0002_01,1,1,3,24.0,1,109.0,9.0,25.0,549.0,...,2110,6,1,2,1,736.0,7,36,1489,799
2,0003_01,2,1,3,58.0,2,43.0,3576.0,0.0,6715.0,...,1991,1,1,2,2,10383.0,11,4,1,1641
3,0003_02,2,1,3,33.0,1,0.0,1283.0,371.0,3329.0,...,1991,1,1,2,2,5176.0,11,4,1,1641
4,0004_01,1,1,3,16.0,1,303.0,70.0,151.0,565.0,...,1779,6,2,2,1,1091.0,7,36,2295,2124


In [112]:
df_ii_customized.dtypes[1:] == 'object'

HomePlanet                               True
CryoSleep                                True
Destination                              True
Age                                     False
VIP                                      True
RoomService                             False
FoodCourt                               False
ShoppingMall                            False
Spa                                     False
VRDeck                                  False
Transported                             False
LastName                                 True
CabinSector                              True
CabinRoom                                True
CabinClass                               True
Associates                              False
TotalSpend                              False
HomePlanet_X_Destination                 True
HomePlanet_X_Destination_X_CryoSleep     True
CabinSector_X_CabinRoom                  True
CabinSector_X_CabinRoom_X_CabinClass     True
dtype: bool

In [115]:
df_ii_customized.select_dtypes(include=['object']).columns.to_list()

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Destination',
 'VIP',
 'LastName',
 'CabinSector',
 'CabinRoom',
 'CabinClass',
 'HomePlanet_X_Destination',
 'HomePlanet_X_Destination_X_CryoSleep',
 'CabinSector_X_CabinRoom',
 'CabinSector_X_CabinRoom_X_CabinClass']

In [106]:
df_ii_customized.isnull().sum()

PassengerId                             0
HomePlanet                              0
CryoSleep                               0
Destination                             0
Age                                     0
VIP                                     0
RoomService                             0
FoodCourt                               0
ShoppingMall                            0
Spa                                     0
VRDeck                                  0
Transported                             0
LastName                                0
CabinSector                             0
CabinRoom                               0
CabinClass                              0
Associates                              0
TotalSpend                              0
HomePlanet_X_Destination                0
HomePlanet_X_Destination_X_CryoSleep    0
CabinSector_X_CabinRoom                 0
CabinSector_X_CabinRoom_X_CabinClass    0
dtype: int64

In [99]:
df_ii_customized.to_csv('../Data/input_ii_customized.csv', index=False)

In [101]:
dump(encoder_dict, '../Output/encoder_dict.pkl')

['../Output/encoder_dict.pkl']