In [1]:
import pandas as pd
import os
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle
import numpy as np
import category_encoders as ce

In [2]:
# Load the per-model accuracy summaries (CatBoost, RandomForest,
# LogisticRegression) written to CSV beforehand.
df_cb, df_rf, df_lr = map(
    pd.read_csv,
    ('results_cb.csv', 'results_rf.csv', 'results_lr.csv'),
)

# Show all three frames in a single cell output.
display(df_cb, df_rf, df_lr)

Unnamed: 0,TrainAcc,TestAcc
0,0.494101,0.533742


Unnamed: 0,TrainAcc,TestAcc
0,0.997955,0.705993


Unnamed: 0,TrainAcc,TestAcc
0,0.731792,0.712128


In [3]:
# Build the (initially empty) comparison table: one row per candidate model,
# one column per accuracy metric.
index = ['CatBoost', 'RandomForest', 'LogisticRegression']
cols = ['TrainAcc', 'TestAcc']
# dtype=float keeps the accuracy columns numeric once they are filled in;
# without it the placeholder frame is created with object dtype.
df = pd.DataFrame(columns=cols, index=index, dtype=float)
df

Unnamed: 0,TrainAcc,TestAcc
CatBoost,,
RandomForest,,
LogisticRegression,,


In [4]:
# Copy the first (row label 0) record of each per-model results frame into
# the matching row of the comparison table.
for model_label, model_results in (
    ('CatBoost', df_cb),
    ('RandomForest', df_rf),
    ('LogisticRegression', df_lr),
):
    df.loc[model_label] = model_results.loc[0]
df

Unnamed: 0,TrainAcc,TestAcc
CatBoost,0.494101,0.533742
RandomForest,0.997955,0.705993
LogisticRegression,0.731792,0.712128


In [5]:
# Persist the combined accuracy table; the row labels (model names) are
# written as the first CSV column because the index is kept.
df.to_csv('composed_result.csv')

In [6]:
# Delete the per-model result files now that they are merged into
# 'composed_result.csv'.
# NOTE: the loading cell reads these same files, so after this cell has run
# the notebook cannot be re-executed from the top until they are regenerated.
for results_file in ('results_cb.csv', 'results_rf.csv', 'results_lr.csv'):
    # Guard the removal so re-running this cell does not raise
    # FileNotFoundError once the files are already gone.
    if os.path.isfile(results_file):
        os.remove(results_file)

In [7]:
# Pick the model with the highest test accuracy; if several tie, the first
# one in table order wins.
test_acc = df['TestAcc']
is_top = test_acc.eq(test_acc.max())
best = df.index[is_top].to_list()[0]
best

'LogisticRegression'

In [8]:
# Map each candidate model name (the row labels of the accuracy table) to the
# file holding its pickled model.
model_files = {
    'LogisticRegression': 'model_lr.sav',
    'RandomForest': 'model_rf.sav',
    'CatBoost': 'model_cb.sav',
}
if best not in model_files:
    # Fail here instead of leaving `best_file` undefined (or stale from an
    # earlier run of this cell) when the winning name is unexpected.
    raise ValueError(f'no saved model file is known for {best!r}')
best_file = model_files[best]

In [9]:
# Load the winning model from disk. The context manager closes the file
# handle as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code during deserialization; only
# load model files that come from a trusted source.
with open(best_file, 'rb') as model_fh:
    model = pickle.load(model_fh)

In [10]:
# Read the test set that predictions will be generated for.
test_path = 'test_new.csv'
test_data = pd.read_csv(test_path)
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,NumCabin,Side
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3.0,S
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4.0,S
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0.0,S
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1.0,S
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,G,1496.0,S
4273,9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,F,442.0,S
4274,9271_01,Mars,True,55 Cancri e,26.0,False,0.0,0.0,0.0,0.0,0.0,D,296.0,P
4275,9273_01,Europa,False,TRAPPIST-1e,26.0,False,0.0,2680.0,0.0,0.0,523.0,D,297.0,P


In [11]:
# Fill missing CryoSleep values with the column's most frequent value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that form is chained
# assignment, which does not update `test_data` under pandas Copy-on-Write.
cryo_mode = test_data['CryoSleep'].mode()[0]
test_data['CryoSleep'] = test_data['CryoSleep'].fillna(cryo_mode)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4277 non-null   object 
 2   CryoSleep     4277 non-null   bool   
 3   Destination   4277 non-null   object 
 4   Age           4277 non-null   float64
 5   VIP           4277 non-null   bool   
 6   RoomService   4277 non-null   float64
 7   FoodCourt     4277 non-null   float64
 8   ShoppingMall  4277 non-null   float64
 9   Spa           4277 non-null   float64
 10  VRDeck        4277 non-null   float64
 11  Deck          4277 non-null   object 
 12  NumCabin      4277 non-null   float64
 13  Side          4277 non-null   object 
dtypes: bool(2), float64(7), object(5)
memory usage: 409.5+ KB


In [12]:
# Feature columns passed to the model: every column of the test frame except
# the passenger identifier and the five spending columns.
cols = list(test_data.columns)
for excluded in (
    'PassengerId',
    'FoodCourt',
    'RoomService',
    'ShoppingMall',
    'Spa',
    'VRDeck',
):
    # list.remove raises ValueError if an expected column is missing.
    cols.remove(excluded)



In [13]:
# Fresh, unfitted count encoder from category_encoders, created with default
# settings.
# NOTE(review): this is a new encoder, not one restored from the training
# step - confirm that matches how the saved model's features were encoded.
ce_encoder = ce.CountEncoder()

In [14]:
# Encode the selected feature columns, then predict with the loaded model.
# NOTE(review): the encoder is fit on the test data in this call; the encoding
# used when the model was trained is not visible in this notebook - confirm
# the two are consistent.
features_encoded = ce_encoder.fit_transform(test_data[cols])
test_data['Transported'] = model.predict(features_encoded)
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,NumCabin,Side,Transported
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3.0,S,True
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4.0,S,False
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0.0,S,True
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1.0,S,False
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5.0,S,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,G,1496.0,S,True
4273,9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,F,442.0,S,False
4274,9271_01,Mars,True,55 Cancri e,26.0,False,0.0,0.0,0.0,0.0,0.0,D,296.0,P,True
4275,9273_01,Europa,False,TRAPPIST-1e,26.0,False,0.0,2680.0,0.0,0.0,523.0,D,297.0,P,False


In [15]:
# Remove the pickled models that were not selected, keeping only `best_file`.
filenames = ['model_lr.sav', 'model_cb.sav', 'model_rf.sav']
filenames.remove(best_file)
# filter() is lazy, so each path is checked right before it is removed;
# paths that are not existing files are skipped.
for file in filter(os.path.isfile, filenames):
    os.remove(file)

In [16]:
# Keep only the passenger id and the predicted label for export.
submission_cols = ['PassengerId', 'Transported']
preds = test_data[submission_cols]
preds

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [44]:
# Write the predictions to disk; index=False leaves the row index out so the
# file contains only the PassengerId and Transported columns.
preds.to_csv('preds.csv', index=False)