In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

In [2]:
train_df = pd.read_csv('../train.csv', index_col='PassengerId')
test_df = pd.read_csv('../test.csv', index_col='PassengerId')

In [3]:
train_df.drop('Name', axis=1, inplace=True)
test_df.drop('Name', axis=1, inplace=True)

In [4]:
train_df['Transported'].replace( {False: 0, True: 1} )

PassengerId
0001_01    0
0002_01    1
0003_01    0
0003_02    0
0004_01    1
          ..
9276_01    0
9278_01    0
9279_01    1
9280_01    0
9280_02    1
Name: Transported, Length: 8693, dtype: object

In [5]:
train_df[['deck','num', 'side']] = train_df['Cabin'].str.split('/', expand=True)
test_df[['deck','num', 'side']] = test_df['Cabin'].str.split('/', expand=True)

train_df.drop('Cabin', axis=1, inplace=True)
test_df.drop('Cabin', axis=1, inplace=True)

In [6]:
object_columns = [column for column in train_df.columns if train_df[column].dtype == 'object' or train_df[column].dtype == 'category']
numeric_columns = [column for column in train_df.columns if train_df[column].dtype == 'float64']

print(f'Object cols -- {object_columns}')
print(f'Numeric cols -- {numeric_columns}')

Object cols -- ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'num', 'side']
Numeric cols -- ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [7]:
expense_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train_df['SumSpends'] = train_df[expense_columns].sum(axis=1)
test_df['SumSpends'] = test_df[expense_columns].sum(axis=1)

In [8]:
null_cols = train_df.isnull().sum().sort_values(ascending=False)
null_cols = list(null_cols[null_cols>1].index)
null_cols

['CryoSleep',
 'ShoppingMall',
 'VIP',
 'HomePlanet',
 'deck',
 'num',
 'side',
 'VRDeck',
 'FoodCourt',
 'Spa',
 'Destination',
 'RoomService',
 'Age']

In [9]:
print(f'Train DF shape: {train_df.shape}')
print(f'Test DF shape: {test_df.shape}')

Train DF shape: (8693, 15)
Test DF shape: (4277, 14)


In [10]:
from sklearn.preprocessing import OrdinalEncoder

oc = OrdinalEncoder()

df_for_encode = pd.concat([train_df, test_df])

df_for_encode[object_columns] = df_for_encode[object_columns].astype('category')

df_for_encode[object_columns] = oc.fit_transform(df_for_encode[object_columns])

del train_df, test_df

train_df = df_for_encode.iloc[:8693, :]
test_df = df_for_encode.iloc[8693: , :]

del df_for_encode

test_df.drop('Transported', inplace=True, axis=1)

In [11]:
print(f'Train DF shape: {train_df.shape}')
print(f'Test DF shape: {test_df.shape}')

Train DF shape: (8693, 15)
Test DF shape: (4277, 14)


In [12]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


ct = ColumnTransformer([("imp", SimpleImputer(strategy='mean'), null_cols)])
    
train_df[null_cols] = ct.fit_transform(train_df[null_cols])
test_df[null_cols] = ct.fit_transform(test_df[null_cols])

In [13]:
X = train_df.copy()
y = X.pop('Transported')
train_df.copy()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,SumSpends
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001_01,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,False,1.0,0.0,0.0,0.0
0002_01,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,True,5.0,0.0,1.0,736.0
0003_01,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,False,0.0,0.0,1.0,10383.0
0003_02,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,False,0.0,0.0,1.0,5176.0
0004_01,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,True,5.0,1.0,1.0,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,1.0,0.0,0.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,False,0.0,1872.0,0.0,8536.0
9278_01,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,False,6.0,556.0,1.0,0.0
9279_01,0.0,0.0,2.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,True,6.0,559.0,1.0,1873.0
9280_01,1.0,0.0,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,False,4.0,1460.0,1.0,4637.0


In [14]:
from catboost import CatBoostClassifier

best_features = ['CryoSleep', 'RoomService', 'Spa', 'VRDeck', 'deck', 'side', 'SumSpends']

In [15]:
model = CatBoostClassifier(verbose=False, eval_metric='Accuracy', iterations=492, learning_rate=0.05513269656476656, depth=6)
model.fit(X[best_features], y)
prediction = model.predict(test_df[best_features])

In [16]:
final = pd.DataFrame()
final.index = test_df.index
final['Transported'] = prediction
final['Transported'].replace(0, False, inplace=True)
final['Transported'].replace(1, True, inplace=True)
final.to_csv('prediction.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final['Transported'].replace(0, False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final['Transported'].replace(1, True, inplace=True)
