# Subject: space Titanic prediction 
# Link: https://www.kaggle.com/competitions/spaceship-titanic/data?select=sample_submission.csv
# Author: Pierre Mulliez 
# Date start: 28/11/2022
# Description: classify passengers as transported successfully (TRUE) or not (FALSE)

In [13]:
# import packages 
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [14]:
# import data and print summaries
titanic_df = pd.read_csv("data/train.csv")
print(titanic_df.head(5))
print('------------------')
print('dataframe shape: {}'.format(titanic_df.shape))
print('------------------')
print(titanic_df.describe())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
--

In [15]:
# baseline model without feature engineering all TRUE 
test = [1 for count in range(0,len(titanic_df))]
no_feature_eng_error = accuracy_score(titanic_df.loc[:,'Transported'].factorize()[0],test)
print('baseline accuracy without feature engineering is {}'.format(round(no_feature_eng_error,2)))

baseline accuracy without feature engineering is 0.5


In [16]:
# null values 
print(titanic_df.isnull().sum())
#titanic_df = titanic_df.dropna()
titanic_df.loc[titanic_df['Destination'].isnull() == True,:].head(10)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
47,0045_02,Mars,True,F/10/P,,19.0,False,0.0,0.0,0.0,0.0,0.0,Mass Chmad,True
128,0138_02,Earth,False,E/5/P,,34.0,False,0.0,22.0,0.0,564.0,207.0,Monah Gambs,False
139,0152_01,Earth,False,F/32/P,,41.0,False,0.0,0.0,0.0,0.0,607.0,Andan Estron,False
347,0382_01,,False,G/64/P,,23.0,False,348.0,0.0,0.0,4.0,368.0,Blanie Floydendley,False
430,0462_01,Earth,True,G/67/S,,50.0,False,0.0,0.0,0.0,0.0,0.0,Ronia Sosanturney,False
468,0504_03,Europa,True,B/19/S,,24.0,False,0.0,0.0,0.0,0.0,0.0,Sterops Unpasine,True
529,0564_03,Europa,False,B/24/P,,28.0,False,0.0,410.0,2.0,1354.0,3695.0,Aldibi Dinsprody,False
540,0571_04,Europa,True,C/21/P,,30.0,False,0.0,0.0,0.0,0.0,,Izark Hednigic,True
547,0576_01,Earth,False,F/107/S,,21.0,False,0.0,,625.0,110.0,0.0,Melice Herry,False
570,0597_01,Earth,True,G/91/P,,23.0,False,0.0,0.0,0.0,0.0,0.0,Arllia Roachoanand,True


In [18]:
# train test split
# drop unessesary collumns
y = titanic_df.loc[:,'Transported']
X = titanic_df.loc[:,(titanic_df.columns != 'Transported') & (titanic_df.columns != 'Name')]

#target encoding
def encode(X):
    for colname in X.select_dtypes("object"):
        X[colname], _ = X[colname].factorize()
    return X
X = encode(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [19]:
# scaling
def scale(X):
    X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
    return X_scaled
X_scaled = scale(X)

In [20]:
# null values 
def replace_nulls(data,train_variables,collumn_predict):
    # if nulls in train variable drop 
    for train_var in train_variables:
        data = data[data[train_var].isnull() == True,:].dropna()
    
    # get variable to predict
    data_pred = data[data[collumn_predict].isnull() == True,:]
    data_train = data[data[collumn_predict].isnull() == False,:]


    X = data_train[train_var]
    y = data_train[collumn_predict]
    
    reg = LinearRegression().fit(X, y)
    data[data[collumn_predict].isnull() == True,collumn_predict] = reg.predict(data_pred[train_var])
    return data
test = replace_nulls(X_scaled, ('HomePlanet','Age'),'Destination')
print(test)

KeyError: ('HomePlanet', 'Age')

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.7, random_state=42)

In [114]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
first_model_error = accuracy_score(y_test,predictions)
print('Random forest accuracy:  {}'.format(round(first_model_error,2)))

Random forest error:  0.74
