In [22]:
import pandas as pd
import numpy as np  
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier



In [23]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
train_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [24]:
feature_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [25]:
train_df[['Deck', 'Num', 'Side']] = train_df['Cabin'].str.split("/", expand=True)
test_df[['Deck', 'Num', 'Side']] = test_df['Cabin'].str.split("/", expand=True)

In [26]:
train_df['Num'] = pd.to_numeric(train_df['Num'], errors='coerce')  
test_df['Num'] = pd.to_numeric(test_df['Num'], errors='coerce')

In [27]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  Deck          8494 non-null   object 
 15  Num           8494 non-null   float64
 16  Side          8494 non-null   object 
dtypes: bool(1), float64(7), object(9)
memory usage: 1.1+ MB


In [28]:
X_train = train_df[feature_cols + ['Deck','Num','Side']]
y_train = train_df['Transported']
X_test = test_df[feature_cols + ['Deck','Num','Side']]

In [29]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Deck          8494 non-null   object 
 12  Num           8494 non-null   float64
 13  Side          8494 non-null   object 
dtypes: float64(7), object(7)
memory usage: 950.9+ KB


In [30]:
X_train.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Deck            199
Num             199
Side            199
dtype: int64

In [31]:
print(train_df[X_train.columns].dtypes)


HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Deck             object
Num             float64
Side             object
dtype: object


In [32]:
cat = ['HomePlanet', 'CryoSleep', 'Deck', 'Side', 'Destination', 'VIP']
for col in cat:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0]) 
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0]) 

  train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
  test_df[col] = test_df[col].fillna(test_df[col].mode()[0])


In [33]:
train_df.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Deck              0
Num             199
Side              0
dtype: int64

In [34]:
X_train.drop(columns=['Cabin'], inplace=True)
# test_df.drop(columns=['Cabin'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=['Cabin'], inplace=True)


In [35]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

In [36]:
FillValues = ColumnTransformer([
    ('SimpleImputer_Numerical', SimpleImputer(strategy='mean'), 
     ['Num', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']),

    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['Deck', 'Side', 'HomePlanet', 'Destination']),
    
    ('OrdinalEncoder', OrdinalEncoder(), ['VIP']),
    ('StandardScaler', StandardScaler(), ['Num', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])
], remainder='passthrough')  

In [37]:
tra = FillValues.fit_transform(X_train)

In [38]:
tra[:5]

array([[0.0, 39.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0,
        0.0, 0.0, -1.1729663174046185, 0.7020948248098347,
        -0.33702544483852215, -0.2842737687877877, -0.2873167272969758,
        -0.27373585397312594, -0.2660977053944768, False],
       [0.0, 24.0, 109.0, 9.0, 25.0, 549.0, 44.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 1.0, 0.0, 0.0, -1.1729663174046185, -0.3332325821119825,
        -0.17352834651501453, -0.27868854450525776, -0.2459712340863275,
        0.2092672125090491, -0.22769155475646868, False],
       [0.0, 58.0, 43.0, 3576.0, 0.0, 6715.0, 49.0, 1.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
        0.0, 1.0, 0.0, 1.0, -1.1729663174046185, 2.013509540244136,
        -0.2725265895365879, 1.9349220128041078, -0.2873167272969758,
        5.634033893618504, -0.223327

In [39]:
pipe = Pipeline([
    ("Encodings",FillValues),
    ('model',RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1))
])

In [40]:
pipe.fit(X_train, y_train_encoded)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [43]:
y_pred = pipe.predict(X_test)
print(y_pred)

[1 0 1 ... 1 1 1]


In [46]:
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'], 
    'Transported': y_pred  
})

submission.to_csv("submission.csv", index=False)

print("Submission file saved successfully!")


Submission file saved successfully!
