In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
# Load the three competition CSV files (train, test, sample submission) into DataFrames.
df_train = pd.read_csv('../input/spaceship-titanic/train.csv')
df_test = pd.read_csv('../input/spaceship-titanic/test.csv')
df_sample = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

In [3]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
def rm_many_null_rows(df):
    """Annotate every row of ``df`` with the number of missing values it holds.

    Despite its name, this function does not remove anything: it only adds a
    ``NullCount`` column. Filtering on that column is left to the caller.
    It is meant to be applied to training data only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying an integer ``NullCount`` column
        with the per-row count of missing cells.
    """
    # Vectorised null count. This replaces a cell-by-cell Python loop that
    # compared str(value).lower() to 'nan', which was slow (one .iloc lookup
    # per cell) and missed None while matching literal "nan" strings.
    df['NullCount'] = df.isna().sum(axis=1)
    return df

In [5]:
# Count missing values per row, then keep only the rows with fewer than 3 of them.
df_train = rm_many_null_rows(df_train)
# .copy() detaches the filtered rows from the original frame. feat_eng later
# assigns new columns to df_train in place, and doing that on a bare slice
# raises pandas' SettingWithCopyWarning.
df_train = df_train[df_train['NullCount'] < 3].copy()

In [6]:
# feature engineering

def _split_part(value, sep, index):
    """Return piece ``index`` of ``value`` split on ``sep``, or NaN if there is none."""
    # Missing cells arrive as float NaN; anything that is not a string cannot
    # be split, so it maps to NaN instead of raising.
    if not isinstance(value, str):
        return np.nan
    parts = value.split(sep)
    return parts[index] if index < len(parts) else np.nan


def feat_eng(df):
    """Add the engineered feature columns to ``df``.

    Columns added:

    * ``CabinGroup``       - first ``/``-separated piece of ``Cabin``.
    * ``CabinZone``        - third ``/``-separated piece of ``Cabin``.
    * ``PassengerIdGroup`` - piece of ``PassengerId`` before the first ``_``.
    * ``FamilySize``       - number of rows of ``df`` sharing the same
      ``PassengerIdGroup``.

    A value that is missing, is not a string, or has too few pieces yields
    NaN in the derived column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Cabin`` and ``PassengerId`` columns. It is modified in
        place, so calling the function again on the same frame simply
        recomputes the same columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the four columns above added.
    """
    # cabin feat group / zone
    df['CabinGroup'] = df['Cabin'].map(lambda cabin: _split_part(cabin, '/', 0))
    df['CabinZone'] = df['Cabin'].map(lambda cabin: _split_part(cabin, '/', 2))

    # family size: passengers are grouped by the prefix of their id
    df['PassengerIdGroup'] = df['PassengerId'].map(lambda pid: _split_part(pid, '_', 0))

    # 'count' ignores missing values, so rows whose group is NaN (kept by
    # dropna=False) receive a FamilySize of 0.
    df['FamilySize'] = df.groupby(['PassengerIdGroup'], dropna=False)['PassengerIdGroup'].transform('count')

    return df

# doing some data exploration

In [7]:
# Run feature engineering on the training frame and keep its first rows in `df` for exploration.
df = feat_eng(df_train).head()

In [8]:
# survive_count = []
# survive_total = []
# # overly complicated, not likely needed, the machine learning will figure this stuff out

# for i in df_data_cat['CabinGroupArea'].value_counts().index:
#     count = 0.0
#     total = 0
#     for j in range(len(df_data_cat)):
#         if df_data_cat['CabinGroupArea'].iloc[j] == i:
#             if str(df_data_cat['Transported'].iloc[j]) != 'nan':
#                 count += df_data_cat['Transported'].iloc[j]
#                 total += 1
#     survive_count.append(count)
#     survive_total.append(total)
    
# survive_avg = [survive_count[i]/survive_total[i] for i in range(len(survive_count))]

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate target from predictors.
# feat_eng adds its columns to df_train in place and returns that same frame,
# so a single call serves both the target and the predictors.
train_features = feat_eng(df_train)
y = train_features.Transported
X = train_features.drop(['Transported'], axis=1)

# Divide data into training and validation subsets (80% / 20%, fixed seed)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Categorical columns are listed by hand. An automatic alternative would be to
# pick the object-dtype columns with fewer than 10 unique values.
# FamilySize is a count but is listed here, so it is handled as a category
# by whatever consumes categorical_cols. 'Name' is left out.
categorical_cols = ['HomePlanet', 'CryoSleep', 'CabinGroup', 'CabinZone', 'Destination', 'VIP', 'FamilySize']

# Select numerical columns
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Keep selected columns only (PassengerId is carried along as an identifier)
my_cols = ['PassengerId'] + categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [10]:
# Display the selected training features.
X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,CabinGroup,CabinZone,Destination,VIP,FamilySize,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
6884,7290_02,Europa,True,D,P,55 Cancri e,False,5,35.0,0.0,0.0,0.0,0.0,0.0
984,1046_01,Earth,False,F,S,TRAPPIST-1e,False,1,16.0,507.0,0.0,0.0,0.0,
2869,3100_02,Earth,False,F,S,TRAPPIST-1e,False,3,52.0,809.0,0.0,0.0,0.0,0.0
143,0164_01,Earth,False,G,S,TRAPPIST-1e,False,2,57.0,50.0,1688.0,0.0,,135.0
4854,5180_01,Earth,True,G,P,PSO J318.5-22,False,1,20.0,0.0,0.0,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4380,4659_01,Earth,False,F,S,TRAPPIST-1e,False,1,24.0,703.0,0.0,0.0,0.0,0.0
7908,8451_01,Mars,False,F,S,55 Cancri e,False,2,31.0,1997.0,0.0,0.0,0.0,0.0
4867,5193_01,Europa,False,C,S,TRAPPIST-1e,False,1,34.0,0.0,914.0,0.0,2433.0,647.0
3268,3511_01,Mars,False,F,S,55 Cancri e,False,1,22.0,124.0,0.0,1763.0,0.0,1014.0


In [11]:
# TODO: replace or drop CabinGroup 'T' - only a handful of passengers fall in it (see the T_P / T_S rows of the crosstab below)

In [12]:
# Show the first two rows of the training data with the engineered columns added.
feat_eng(df_train).head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,NullCount,CabinGroup,CabinZone,PassengerIdGroup,FamilySize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0,B,P,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0,F,S,2,1


In [13]:
# Transport rate per CabinGroup/CabinZone combination.
# feat_eng returns df_train itself with the engineered columns added, so it is
# called once here instead of once per column.
cabin_features = feat_eng(df_train)
cabin_area = cabin_features['CabinGroup'] + '_' + cabin_features['CabinZone']
CrosstabResult = pd.crosstab(index=cabin_area, columns=cabin_features['Transported'])

# Columns 0 and 1 hold the False / True counts; rate = True / (False + True).
not_transported = CrosstabResult.iloc[:, 0]
transported = CrosstabResult.iloc[:, 1]
CrosstabResult['TransportRate'] = (transported / (not_transported + transported)).round(decimals=2)
print(CrosstabResult)

Transported  False  True  TransportRate
row_0                                  
A_P             66    51           0.44
A_S             63    76           0.55
B_P            115   238           0.67
B_S             92   333           0.78
C_P            143   197           0.58
C_S             96   309           0.76
D_P            148   100           0.40
D_S            123   107           0.47
E_P            282   147           0.34
E_S            281   166           0.37
F_P            846   591           0.41
F_S            717   635           0.47
G_P            704   571           0.45
G_S            533   747           0.58
T_P              3     1           0.25
T_S              1     0           0.00


In [14]:
# CrosstabResult=pd.crosstab(index=feat_eng(df_train)['FamilySize'],columns=feat_eng(df_train)['Transported'])
# CrosstabResult['Divided col 0/1'] = (CrosstabResult.iloc[:,0] / CrosstabResult.iloc[:,1]).round(decimals=2)
# print(CrosstabResult)

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
# strategy='constant' without an explicit fill_value: per the scikit-learn
# docs the default fill for numerical data is 0, so missing numbers become 0.
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
# Missing categories are filled with the column's most frequent value, then
# every column is one-hot encoded. handle_unknown='ignore' keeps transform
# from raising on categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
# Only the columns named in numerical_cols / categorical_cols are passed to
# the transformers.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [16]:
# evaluate the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def get_score(preprocessor, model):
    """Cross-validate ``model`` behind ``preprocessor`` and return the fold scores.

    The two steps are chained into a single pipeline, which is scored with
    5-fold cross-validation (accuracy) on the notebook-level ``X`` and ``y``.

    Returns the per-fold accuracy scores produced by ``cross_val_score``.
    """
    from sklearn.model_selection import cross_val_score

    # Preprocessing and estimator travel together so each fold is
    # preprocessed inside the cross-validation loop.
    pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
    return cross_val_score(
        Pipeline(steps=pipeline_steps),
        X,
        y,
        scoring='accuracy',
        cv=5,
    )
# print(f"Accuracy scores:\n{scores}\n")
# print(f"Average scores: {round(scores.mean()*100, 2)}%")

In [17]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Candidate estimators to compare. Only the uncommented entries are evaluated;
# the commented-out lines are alternatives that can be switched back on.
models = []
# models.append(RandomForestClassifier(n_estimators=125, random_state=0))
# models.append(RandomForestClassifier(n_estimators=150, random_state=0))
# models.append(RandomForestClassifier(n_estimators=175, random_state=0))
models.append(RandomForestClassifier(n_estimators=200, random_state=0))
# models.append(RandomForestClassifier(n_estimators=1000, random_state=0))


# models.append(XGBClassifier(random_state=0))
# models.append(XGBClassifier(n_estimators=100, learning_rate=0.001, random_state=0))
# models.append(KNeighborsClassifier(n_neighbors=45))
# models.append(MultinomialNB())
# models.append(SVC(kernel='linear'))


# Cross-validate each candidate and report mean / std of the fold accuracies.
# NOTE: the loop variable `model` stays defined after the loop and holds the
# last candidate.
for model in models:
    score = get_score(preprocessor, model)
    print(f"{model}\nAvg accuracy: {round(score.mean()*100, 2)}%\nStd of accuracy: {round(score.std()*100, 2)}%\n\n")

RandomForestClassifier(n_estimators=200, random_state=0)
Avg accuracy: 77.94%
Std of accuracy: 0.58%




In [18]:
from sklearn.metrics import accuracy_score

# Bundle preprocessing and modeling code in a pipeline.
# The estimator is taken explicitly from the candidate list (its last entry)
# instead of relying on a loop variable left over from another cell.
final_model = models[-1]
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', final_model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Score the hold-out predictions so the validation split is actually used.
valid_accuracy = accuracy_score(y_valid, preds)
print(f"Validation accuracy: {round(valid_accuracy*100, 2)}%")

In [19]:
# Submitting

# Add the engineered features to the test data, then predict with the fitted pipeline
preds = my_pipeline.predict(feat_eng(df_test))

In [20]:
# Pair each test PassengerId with its prediction (cast to a plain Python bool)
# and write the result as the submission file.
output = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': list(map(bool, preds)),
})
output.to_csv('submission.csv', index=False)

In [21]:
# Display the submission frame that was just written.
output

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [22]:
# Display the sample submission for comparison with the generated one.
df_sample

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False
