In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from joblib import dump, load

In [61]:
kaggle = pd.read_csv('../Data/test.csv')
kaggle.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [96]:
kaggle.shape

(4277, 18)

In [62]:
kaggle.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

#### Leverage iterative imputer to impute missing numerical values of the kaggle dataset

In [76]:
# Load the previously saved imputer. The context manager closes the file handle,
# which the bare `load(open(...))` form left open for the life of the kernel.
# NOTE: unpickling can execute arbitrary code - only load artifacts you produced yourself.
with open('../Data/iterative_imputer_mean.pkl', 'rb') as imputer_file:
    imputer = load(imputer_file)

In [64]:
object_cols = kaggle.select_dtypes(include='object').columns

In [65]:
# Replace NaN with the '*MISSING*' sentinel in every object-typed column at once.
kaggle[object_cols] = kaggle[object_cols].fillna('*MISSING*')

kaggle.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age              91
VIP               0
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name              0
dtype: int64

In [66]:
missing_names = kaggle[kaggle['Name'] == '*MISSING*'].index

In [67]:
valid_names = [ind for ind in kaggle.index if ind not in missing_names]

In [68]:
# Derive FirstName / LastName from 'Name' with vectorized string operations
# instead of assigning one cell at a time through .loc.
name_is_missing = kaggle['Name'] == '*MISSING*'
name_tokens = kaggle['Name'].str.split()

# Token 0 is the first name, token 1 the last name. Rows whose name is the
# '*MISSING*' sentinel carry the sentinel in both derived columns.
# A name with a single token yields NaN for LastName rather than raising IndexError.
kaggle['FirstName'] = name_tokens.str[0].mask(name_is_missing, '*MISSING*')
kaggle['LastName'] = name_tokens.str[1].mask(name_is_missing, '*MISSING*')

kaggle.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,FirstName,LastName
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,Nelly,Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,Lerome,Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,Sabih,Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,Meratz,Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,Brence,Harperez


In [69]:
missing_cabins = kaggle[kaggle['Cabin'] == '*MISSING*'].index

In [70]:
valid_cabins = [ind for ind in kaggle.index if ind not in missing_cabins]

In [71]:
# Split 'Cabin' ("<sector>/<room>/<class>") into three columns with vectorized
# string operations instead of assigning one cell at a time through .loc.
cabin_is_missing = kaggle['Cabin'] == '*MISSING*'
cabin_tokens = kaggle['Cabin'].str.split('/')

# Rows whose cabin is the '*MISSING*' sentinel carry the sentinel in all three
# derived columns. A cabin with fewer than three parts yields NaN for the
# absent parts rather than raising IndexError.
for position, column in enumerate(['CabinSector', 'CabinRoom', 'CabinClass']):
    kaggle[column] = cabin_tokens.str[position].mask(cabin_is_missing, '*MISSING*')

kaggle.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,FirstName,LastName,CabinSector,CabinRoom,CabinClass
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,Nelly,Carsoning,G,3,S
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,Lerome,Peckers,F,4,S
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,Sabih,Unhearfus,C,0,S
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,Meratz,Caltilter,C,1,S
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,Brence,Harperez,F,5,S


In [72]:
to_dummy = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinSector', 'CabinClass']
to_drop = ['FirstName', 'LastName', 'CabinRoom']

In [97]:
# drop() returns a new frame, so `kaggle` keeps its identifier and free-text
# columns while the intermediate frame loses them.
kaggle_intermed = kaggle.drop(columns=['PassengerId', 'Cabin', 'Name', *to_drop])
kaggle_intermed.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinSector,CabinClass
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S


In [98]:
kaggle_intermed['VIP'].value_counts()

False        4110
*MISSING*      93
True           74
Name: VIP, dtype: int64

In [99]:
kaggle_intermed = pd.get_dummies(kaggle_intermed, columns=to_dummy, drop_first=False)
kaggle_intermed.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_*MISSING*,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S
0,27.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
1,19.0,0.0,9.0,0.0,2823.0,0.0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,31.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,38.0,0.0,6652.0,0.0,181.0,585.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,20.0,10.0,0.0,635.0,0.0,0.0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1


In [100]:
kaggle_intermed['VIP_*MISSING*'].value_counts()

0    4184
1      93
Name: VIP_*MISSING*, dtype: int64

In [101]:
kaggle_intermed['VIP_False'].value_counts()

1    4110
0     167
Name: VIP_False, dtype: int64

In [102]:
kaggle_intermed['VIP_True'].value_counts()

0    4203
1      74
Name: VIP_True, dtype: int64

In [106]:
# Run the loaded imputer over the dummified frame. The output has more columns
# than the input (38 vs. 32 in the recorded run); the extra six are presumably
# missing-value indicator columns - TODO confirm against the imputer's settings.
# NOTE(review): the result is rebuilt into a DataFrame using kaggle_intermed's own
# column order, so this step is only correct if that order matches the frame the
# imputer was fitted on. Here the CryoSleep_* / VIP_* dummies are ordered
# False, True, *MISSING*, while the model's feature list puts *MISSING* first -
# verify against the training notebook.
imputed_kaggle = imputer.transform(kaggle_intermed)
imputed_kaggle.shape

(4277, 38)

In [107]:
null_columns = kaggle_intermed.loc[:, 'Age':'VRDeck'].columns
len(null_columns)

6

In [108]:
# One '<column>_imputed' label for each column in null_columns.
flag_columns = [f'{column_name}_imputed' for column_name in null_columns]
flag_columns

['Age_imputed',
 'RoomService_imputed',
 'FoodCourt_imputed',
 'ShoppingMall_imputed',
 'Spa_imputed',
 'VRDeck_imputed']

In [109]:
kaggle_intermed.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_*MISSING*', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'CryoSleep_False', 'CryoSleep_True',
       'CryoSleep_*MISSING*', 'Destination_*MISSING*',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'VIP_False', 'VIP_True', 'VIP_*MISSING*',
       'CabinSector_*MISSING*', 'CabinSector_A', 'CabinSector_B',
       'CabinSector_C', 'CabinSector_D', 'CabinSector_E', 'CabinSector_F',
       'CabinSector_G', 'CabinSector_T', 'CabinClass_*MISSING*',
       'CabinClass_P', 'CabinClass_S'],
      dtype='object')

In [110]:
len(list(kaggle_intermed.columns))

32

In [111]:
imputed_kaggle = pd.DataFrame(imputed_kaggle, columns=(list(kaggle_intermed.columns) + flag_columns))
imputed_kaggle.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_*MISSING*,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S,Age_imputed,RoomService_imputed,FoodCourt_imputed,ShoppingMall_imputed,Spa_imputed,VRDeck_imputed
0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19.0,0.0,9.0,0.0,2823.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38.0,0.0,6652.0,0.0,181.0,585.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20.0,10.0,0.0,635.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
imputed_kaggle['VIP_False'].value_counts()

1.0    4110
0.0     167
Name: VIP_False, dtype: int64

In [114]:
# Write the imputed numeric columns back onto the original frame in one assignment.
kaggle[null_columns] = imputed_kaggle[null_columns]

#### Numerical values for the kaggle dataset have been updated with the imputed values

In [115]:
kaggle.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
FirstName       0
LastName        0
CabinSector     0
CabinRoom       0
CabinClass      0
dtype: int64

In [122]:
columns_to_dummy = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinSector', 'CabinClass']

In [123]:
kaggle_dummified = pd.get_dummies(kaggle, columns=columns_to_dummy, drop_first=False)
kaggle_dummified.head()

Unnamed: 0,PassengerId,Cabin,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,FirstName,...,CabinSector_B,CabinSector_C,CabinSector_D,CabinSector_E,CabinSector_F,CabinSector_G,CabinSector_T,CabinClass_*MISSING*,CabinClass_P,CabinClass_S
0,0013_01,G/3/S,27.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,Nelly,...,0,0,0,0,0,1,0,0,0,1
1,0018_01,F/4/S,19.0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,Lerome,...,0,0,0,0,1,0,0,0,0,1
2,0019_01,C/0/S,31.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,Sabih,...,0,1,0,0,0,0,0,0,0,1
3,0021_01,C/1/S,38.0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,Meratz,...,0,1,0,0,0,0,0,0,0,1
4,0023_01,F/5/S,20.0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,Brence,...,0,0,0,0,1,0,0,0,0,1


In [128]:
# Load the saved logistic-regression artifact. The context manager closes the
# file handle that the bare `load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code - only load artifacts you produced yourself.
with open('../Output/logreg_cv_ii.joblib', 'rb') as model_file:
    model = load(model_file)

In [129]:
model.feature_names

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalSpend',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_*MISSING*',
 'CryoSleep_False',
 'CryoSleep_True',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_*MISSING*',
 'VIP_False',
 'VIP_True',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S']

In [130]:
training_columns = model.feature_names
training_columns

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalSpend',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_*MISSING*',
 'CryoSleep_False',
 'CryoSleep_True',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_*MISSING*',
 'VIP_False',
 'VIP_True',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S']

In [131]:
test_columns = kaggle_dummified.columns.tolist()
test_columns

['PassengerId',
 'Cabin',
 'Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'FirstName',
 'LastName',
 'CabinRoom',
 'HomePlanet_*MISSING*',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_False',
 'CryoSleep_True',
 'CryoSleep_*MISSING*',
 'Destination_*MISSING*',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_False',
 'VIP_True',
 'VIP_*MISSING*',
 'CabinSector_*MISSING*',
 'CabinSector_A',
 'CabinSector_B',
 'CabinSector_C',
 'CabinSector_D',
 'CabinSector_E',
 'CabinSector_F',
 'CabinSector_G',
 'CabinSector_T',
 'CabinClass_*MISSING*',
 'CabinClass_P',
 'CabinClass_S']

In [132]:
# for i in range(len(training_columns)):
#     if training_columns[i] in test_columns:
#         if training_columns[i] != test_columns[i]:
#             test_columns.remove(training_columns[i])
#             test_columns.insert(i, training_columns[i])
#             print('Moved column ' + training_columns[i] + ' in kaggle dataset')
        
#     else:
#         kaggle_dummified[training_columns[i]] = 0
#         print('Created new column for ' + training_columns[i] + ' in kaggle dataset')
#         test_columns.insert(i, training_columns[i])
    
# print(test_columns == training_columns)

In [133]:
# to_drop = test_columns[len(training_columns):]
# to_drop

In [134]:
def update_cols(df, target_cols):
    """Return a new frame holding exactly ``target_cols``, in that order.

    Columns named in ``target_cols`` that ``df`` lacks are created and filled
    with NaN; columns of ``df`` that are not named in ``target_cols`` are
    dropped. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    target_cols : iterable
        Column labels the result must contain, in the desired order.

    Returns
    -------
    pandas.DataFrame
        Frame with the index of ``df`` and the columns of ``target_cols``.
    """
    # reindex selects, reorders and NaN-fills in a single call. It avoids the
    # `np.NaN` alias (removed in NumPy 2.0) and keeps every row of `df` even
    # when none of the requested columns exist yet.
    return df.reindex(columns=list(target_cols))

In [135]:
kaggle_matched = update_cols(kaggle_dummified, training_columns)

In [136]:
kaggle_matched.isnull().sum()

Age                             0
RoomService                     0
FoodCourt                       0
ShoppingMall                    0
Spa                             0
VRDeck                          0
TotalSpend                   4277
HomePlanet_Earth                0
HomePlanet_Europa               0
HomePlanet_Mars                 0
CryoSleep_*MISSING*             0
CryoSleep_False                 0
CryoSleep_True                  0
Destination_*MISSING*           0
Destination_55 Cancri e         0
Destination_PSO J318.5-22       0
Destination_TRAPPIST-1e         0
VIP_*MISSING*                   0
VIP_False                       0
VIP_True                        0
CabinSector_*MISSING*           0
CabinSector_A                   0
CabinSector_B                   0
CabinSector_C                   0
CabinSector_D                   0
CabinSector_E                   0
CabinSector_F                   0
CabinSector_G                   0
CabinSector_T                   0
CabinClass_*MI

In [137]:
# Sum the five spend columns by name. The previous positional slice (iloc[:, 1:6])
# only worked while those columns happened to sit at positions 1-5.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
kaggle_matched['TotalSpend'] = kaggle_matched[spend_columns].sum(axis=1)

In [138]:
kaggle_matched.isnull().sum()

Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
TotalSpend                   0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
CryoSleep_*MISSING*          0
CryoSleep_False              0
CryoSleep_True               0
Destination_*MISSING*        0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
VIP_*MISSING*                0
VIP_False                    0
VIP_True                     0
CabinSector_*MISSING*        0
CabinSector_A                0
CabinSector_B                0
CabinSector_C                0
CabinSector_D                0
CabinSector_E                0
CabinSector_F                0
CabinSector_G                0
CabinSector_T                0
CabinClass_*MISSING*         0
CabinClass_P                 0
CabinClass_S                 0
dtype: i

In [139]:
res = model.predict(kaggle_matched)

In [143]:
res_df = pd.DataFrame(res, index=kaggle['PassengerId'], columns=['Transported'])

In [149]:
res_df['Transported'] = res_df['Transported'].astype(bool)

In [151]:
res_df.to_csv('../Output/submission_1.csv')

In [152]:
pd.read_csv('../Output/submission_1.csv', index_col='PassengerId')

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,False
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,False
9269_01,True
9271_01,False
9273_01,True


Kaggle Score: .60860

#### Random Forest Predictions

In [153]:
# Load the saved random-forest artifact. The context manager closes the file
# handle that the bare `load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code - only load artifacts you produced yourself.
with open('../Output/rf_gs_ii.joblib', 'rb') as rf_model_file:
    model_rf_gs = load(rf_model_file)

In [154]:
res_rf_gs = model_rf_gs.predict(kaggle_matched)

In [155]:
res_rf_gs_df = pd.DataFrame(res_rf_gs, index=kaggle['PassengerId'], columns=['Transported'])

In [156]:
res_rf_gs_df['Transported'] = res_rf_gs_df['Transported'].astype(bool)

In [159]:
res_rf_gs_df.to_csv('../Output/submission_rf_gs.csv')

In [160]:
pd.read_csv('../Output/submission_rf_gs.csv', index_col='PassengerId')

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,False


Kaggle Score: .72223

#### AdaBoost Predictions

In [163]:
lable = '../Output/submission_rf_gs.csv'.split('/')[-1]

In [166]:
lable.rsplit('.', maxsplit=1)[0]

'submission_rf_gs'

In [172]:
def run_new_model(model_path, features=None, passenger_ids=None, output_dir='../Output'):
    """Predict with a saved model and write a Kaggle submission CSV.

    Parameters
    ----------
    model_path : str
        Path to the serialized model. Its file name without the extension is
        used in the submission file name.
    features : pandas.DataFrame, optional
        Model input. Defaults to the notebook-level ``kaggle_matched`` frame.
    passenger_ids : sequence, optional
        Identifiers used as the submission index. Defaults to
        ``kaggle['PassengerId']`` from the notebook namespace.
    output_dir : str, optional
        Directory that receives ``submission_<model name>.csv``.

    Returns
    -------
    pandas.DataFrame
        The submission as read back from disk, indexed by ``PassengerId``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if features is None:
        features = kaggle_matched
    if passenger_ids is None:
        passenger_ids = kaggle['PassengerId']

    submission_name = model_path.split('/')[-1].rsplit('.', maxsplit=1)[0]
    submission_path = f'{output_dir}/submission_{submission_name}.csv'

    # Close the file handle once the model is loaded (the previous
    # `load(open(...))` form leaked it).
    # NOTE: unpickling can execute arbitrary code - only load trusted artifacts.
    with open(model_path, 'rb') as model_file:
        model = load(model_file)

    # Name the index explicitly so the CSV header is 'PassengerId' even when the
    # ids are passed as a plain list.
    model_res_df = pd.DataFrame(
        model.predict(features),
        index=pd.Index(passenger_ids, name='PassengerId'),
        columns=['Transported'],
    )
    model_res_df['Transported'] = model_res_df['Transported'].astype(bool)
    model_res_df.to_csv(submission_path)

    # Reading the file back shows exactly what will be uploaded.
    return pd.read_csv(submission_path, index_col='PassengerId')

In [176]:
gs_ab_res = run_new_model('../Output/gs_ab_ii.joblib')
gs_ab_res

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,False


Kaggle Score: .69043

#### Gradient Boost Predictions

In [177]:
gb_gs_res = run_new_model('../Output/gb_gs_ii.joblib')

In [178]:
gb_gs_res

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,False


Kaggle Score: .69043