<a href="https://www.kaggle.com/code/stevenmpro/spaceship-titanic?scriptVersionId=104510791" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

This notebook is created to respond to the competition and improve my knowledge of how kaggle works.

In order to use all functions, this notebook is based on Arif Enes Aydın's. 

Notebook inspired from https://www.kaggle.com/code/arifenesaydn/spaceship-titanic

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
x_train = train.drop(columns='Transported')
y_train = train.loc[:, 'Transported']
x_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
train.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [5]:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
# Cabin
x_train['Cabin'] = x_train['Cabin'].str[-1]
x_test['Cabin'] = x_test['Cabin'].str[-1]

In [7]:
# Impute Categorical

categorical = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
def impute_categorical(source, column_list):
    for column in column_list:
        label_count = int(source[column].nunique())
        mode = source[column].mode()[0]
        source[column].fillna(mode, inplace=True)

impute_categorical(x_train, categorical)
impute_categorical(x_test, categorical)

In [8]:
# Impute Bills

bills = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

def impute_bills(source, column_list):
    for column in column_list:
        source[column].fillna(0.0, inplace=True)
    
impute_bills(x_train, bills)
impute_bills(x_test, bills)

In [9]:
# Impute Age

def impute_age(source):
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(source[['Age']])
    source[['Age']] = imputer.transform(source[['Age']])
    
impute_age(x_train)
impute_age(x_test)

In [10]:
# Sum Bills

def merge_bills(source):
    source['Bills'] = source[bills].sum(axis=1)

merge_bills(x_train)
merge_bills(x_test)

In [11]:
x_train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Bills             0
dtype: int64

In [12]:
# Drop Redundan
pass_id = np.array(x_test['PassengerId'])
to_drop = ['PassengerId', 'Name'] + bills

def drop(source, to_drop):
    return source.drop(columns=to_drop)
    
x_train = drop(x_train, to_drop)  
x_test = drop(x_test, to_drop)

In [13]:
binary_categorical = ['CryoSleep', 'VIP', 'Cabin']
multi_category = list(set(categorical) - set(binary_categorical))

def encode_multi(source, column):
    ohe = OneHotEncoder()
    return pd.DataFrame(ohe.fit_transform(source[[column]]).toarray(), columns=ohe.get_feature_names_out())

for column in multi_category:
    x_train = pd.concat([x_train, encode_multi(x_train, column)], axis=1)
    x_test = pd.concat([x_test, encode_multi(x_test, column)], axis=1)

In [14]:
def encode_binary(source, column):
    le = LabelEncoder()
    return le.fit_transform(source[[column]].values.ravel())

for column in binary_categorical:
    x_train[column] = encode_binary(x_train, column)
    x_test[column] = encode_binary(x_test, column)

In [15]:
x_train = drop(x_train, multi_category)  
x_test = drop(x_test, multi_category)

In [16]:
ss = StandardScaler()
x_train[['Age', 'Bills']] = ss.fit_transform(x_train[['Age', 'Bills']])
x_test[['Age', 'Bills']] = ss.fit_transform(x_test[['Age', 'Bills']])
display(x_train.head(3), x_test.head(3))

Unnamed: 0,CryoSleep,Cabin,Age,VIP,Bills,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,0,0,0.709437,0,-0.514066,0.0,0.0,1.0,0.0,1.0,0.0
1,0,1,-0.336717,0,-0.251479,0.0,0.0,1.0,1.0,0.0,0.0
2,0,1,2.034566,1,3.190333,0.0,0.0,1.0,0.0,1.0,0.0


Unnamed: 0,CryoSleep,Cabin,Age,VIP,Bills,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,1,1,-0.118222,0,-0.503425,0.0,0.0,1.0,1.0,0.0,0.0
1,0,1,-0.688601,0,0.502231,0.0,0.0,1.0,1.0,0.0,0.0
2,1,1,0.166968,0,-0.503425,1.0,0.0,0.0,0.0,1.0,0.0


In [17]:
y_train = LabelEncoder().fit_transform(y_train)

In [18]:
LR = LogisticRegression(solver='liblinear')
KNC = KNeighborsClassifier()
LR.fit(x_train, y_train)
prediction = LR.predict(x_test)

In [19]:
data = {'PassengerId': pass_id, 'Transported': prediction}
pred_df = pd.DataFrame.from_dict(data)
pred_df['Transported'].replace({0: False, 1:True}, inplace=True)
pred_df.to_csv('submission.csv', index=False)