In [1]:
import pandas as pd
import numpy as np
from pandas.core.dtypes.common import is_numeric_dtype
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
submission_df = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")


In [3]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
columns_missing = ['HomePlanet', 'CryoSleep', 'Destination',  'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Manually impute missing values
for column in columns_missing:
    if column in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
        # For categorical columns, fill missing values with the most frequent value
        train[column].fillna(train[column].mode()[0], inplace=True)
        test[column].fillna(test[column].mode()[0], inplace=True)
    else:
        # For numerical columns, fill missing values with the mean
        train[column].fillna(train[column].mean(), inplace=True)
        test[column].fillna(test[column].mean(), inplace=True)

In [5]:
train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64

In [6]:
test.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           100
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name             94
dtype: int64

In [7]:
x = train.drop(["Cabin","Name","PassengerId","Transported"],axis=1)
y= train[["Transported"]]
z = train[["PassengerId"]]

In [8]:
x.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [9]:
test_x = test.drop(["Cabin","Name","PassengerId"],axis=1)
test_y= train[["PassengerId"]]

In [10]:
test_x.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0


In [11]:
test_y.head()

Unnamed: 0,PassengerId
0,0001_01
1,0002_01
2,0003_01
3,0003_02
4,0004_01


In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [13]:
for col in x.columns:
    if x[col].dtype== np.float64:
        continue
    else:
        x[col] = le.fit_transform(x[col])
for col in test_x.columns:
    if test_x[col].dtype== np.float64:
        continue
    else:
        test_x[col] = le.fit_transform(test_x[col])

In [14]:
x.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0


In [15]:
test_x.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0,1,2,27.0,0,0.0,0.0,0.0,0.0,0.0
1,0,0,2,19.0,0,0.0,9.0,0.0,2823.0,0.0
2,1,1,0,31.0,0,0.0,0.0,0.0,0.0,0.0
3,1,0,2,38.0,0,0.0,6652.0,0.0,181.0,585.0
4,0,0,2,20.0,0,10.0,0.0,635.0,0.0,0.0


In [16]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   int64  
 1   CryoSleep     4277 non-null   int64  
 2   Destination   4277 non-null   int64  
 3   Age           4277 non-null   float64
 4   VIP           4277 non-null   int64  
 5   RoomService   4277 non-null   float64
 6   FoodCourt     4277 non-null   float64
 7   ShoppingMall  4277 non-null   float64
 8   Spa           4277 non-null   float64
 9   VRDeck        4277 non-null   float64
dtypes: float64(6), int64(4)
memory usage: 334.3 KB


In [17]:
for col in x.columns:
    if x[col].dtype == np.float64:
        continue
    else:
        one = pd.get_dummies(x[col], drop_first=True, prefix=f'One_Hot_{col}')
        x.drop(x[[col]], axis=1,inplace=True)
        x=pd.concat([x,one], axis=1)

In [18]:
x.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,One_Hot_HomePlanet_1,One_Hot_HomePlanet_2,One_Hot_CryoSleep_1,One_Hot_Destination_1,One_Hot_Destination_2,One_Hot_VIP_1
0,39.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,True,False
1,24.0,109.0,9.0,25.0,549.0,44.0,False,False,False,False,True,False
2,58.0,43.0,3576.0,0.0,6715.0,49.0,True,False,False,False,True,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,True,False,False,False,True,False
4,16.0,303.0,70.0,151.0,565.0,2.0,False,False,False,False,True,False


In [19]:
for col in test_x.columns:
    if test_x[col].dtype == np.float64:
        continue
    else:
        one = pd.get_dummies(test_x[col], drop_first=True, prefix=f'One_Hot_{col}')
        test_x.drop(test_x[[col]], axis=1,inplace=True)
        test_x=pd.concat([test_x,one], axis=1)

In [20]:
test_x.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,One_Hot_HomePlanet_1,One_Hot_HomePlanet_2,One_Hot_CryoSleep_1,One_Hot_Destination_1,One_Hot_Destination_2,One_Hot_VIP_1
0,27.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,True,False
1,19.0,0.0,9.0,0.0,2823.0,0.0,False,False,False,False,True,False
2,31.0,0.0,0.0,0.0,0.0,0.0,True,False,True,False,False,False
3,38.0,0.0,6652.0,0.0,181.0,585.0,True,False,False,False,True,False
4,20.0,10.0,0.0,635.0,0.0,0.0,False,False,False,False,True,False


In [21]:
x.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,One_Hot_HomePlanet_1,One_Hot_HomePlanet_2,One_Hot_CryoSleep_1,One_Hot_Destination_1,One_Hot_Destination_2,One_Hot_VIP_1
0,39.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,True,False
1,24.0,109.0,9.0,25.0,549.0,44.0,False,False,False,False,True,False
2,58.0,43.0,3576.0,0.0,6715.0,49.0,True,False,False,False,True,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,True,False,False,False,True,False
4,16.0,303.0,70.0,151.0,565.0,2.0,False,False,False,False,True,False


In [22]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest= train_test_split(x,y, test_size= .2, random_state=41)

In [23]:
from sklearn.ensemble import AdaBoostClassifier

AD = AdaBoostClassifier( n_estimators=100,
                        random_state= 40,
                       )
AD.fit (xtrain, ytrain)
AD.score (xtrain, ytrain)


0.8001150417026172

In [24]:
AD.score (xtest, ytest)

0.7889591719378953

In [25]:
y_pred = AD.predict(test_x)

In [26]:
y_pred_bool = y_pred >= 0.5
submission = pd.DataFrame({ 'PassengerId': submission_df['PassengerId'],'Transported':y_pred_bool})
submission.to_csv('submission.csv', index=False)