In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition files (train, test, sample submission) in order.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
        '../input/spaceship-titanic/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the raw training data.
df_train.head()

In [None]:
def rm_many_null_rows(df):
    """Annotate every row of ``df`` with the number of missing values it holds.

    Despite its name, this function does not remove any rows. It only adds a
    ``NullCount`` column so that the caller can filter on it afterwards.
    It is intended for training data only.

    Missing values are detected with ``DataFrame.isna`` (NaN, None, NaT,
    pd.NA) rather than by comparing the string form of each cell to "nan",
    so a legitimate text value such as "nan" is no longer miscounted and
    ``None`` is no longer missed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with an integer ``NullCount`` column added
        (or overwritten if it already exists).
    """
    # Vectorised row-wise count; assigned positionally (as the original list
    # assignment was) so the result does not depend on index alignment.
    df['NullCount'] = df.isna().sum(axis=1).to_numpy()
    return df

In [None]:
# Count missing values per row, then keep only rows with fewer than 3 of them.
df_train = rm_many_null_rows(df_train)
# .copy() makes the filtered frame independent of the original, so that the
# columns feat_eng() adds later do not trigger pandas' SettingWithCopyWarning.
df_train = df_train[df_train['NullCount'] < 3].copy()

In [None]:
# feature engineering

def _split_field(value, sep, index):
    """Return piece ``index`` of ``value`` split on ``sep``, or NaN if unavailable."""
    # Missing cells arrive as float NaN; anything that is not text has no pieces.
    if not isinstance(value, str):
        return np.nan
    pieces = value.split(sep)
    return pieces[index] if index < len(pieces) else np.nan


def feat_eng(df):
    """Add engineered feature columns to ``df`` and return it.

    Columns added (or overwritten), all derived from existing columns:

    * ``CabinGroup``       - first ``/``-separated piece of ``Cabin``.
    * ``CabinZone``        - third ``/``-separated piece of ``Cabin``.
    * ``PassengerIdGroup`` - part of ``PassengerId`` before the first ``_``.
    * ``FamilySize``       - number of rows in ``df`` sharing the same
      ``PassengerIdGroup`` (non-null ids only, since ``count`` is used).

    A value that is missing, is not a string, or has too few pieces yields
    NaN in the derived column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Cabin`` and ``PassengerId`` columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above added.
    """
    # Cabin-derived features.
    df['CabinGroup'] = [_split_field(cabin, '/', 0) for cabin in df['Cabin']]
    df['CabinZone'] = [_split_field(cabin, '/', 2) for cabin in df['Cabin']]

    # Travel-group id taken from the passenger id prefix.
    df['PassengerIdGroup'] = [
        _split_field(passenger, '_', 0) for passenger in df['PassengerId']
    ]

    # Size of each passenger's group within this frame.
    df['FamilySize'] = df.groupby(['PassengerIdGroup'], dropna=False)['PassengerIdGroup'].transform('count')

    return df

In [None]:
# feat_eng adds its engineered columns to df_train in place and returns it;
# display the first rows of the result.
feat_eng(df_train).head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Engineer features once; the original code ran feat_eng(df_train) twice in a row,
# repeating the same in-place work on the same frame.
train_features = feat_eng(df_train)

# Separate target from predictors
y = train_features.Transported
X = train_features.drop(['Transported'], axis=1)

# Divide data into training and validation subsets (80% / 20%, fixed seed)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
#                         X_train_full[cname].dtype == "object"]
# The list is hand-picked instead; note that FamilySize is treated as categorical here.
categorical_cols = ['HomePlanet', 'CryoSleep', 'CabinGroup', 'CabinZone', 'Destination', 'VIP', 'FamilySize'] # 'Name'

# Select numerical columns
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Keep selected columns only
my_cols = ['PassengerId'] + categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: impute missing values with a constant. No fill_value is
# given, so the imputer's default constant is used.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# evaluate the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def get_score(preprocessor, model, cv=5, scoring='accuracy'):
    """Cross-validate ``preprocessor`` + ``model`` on the notebook's training data.

    The two steps are bundled into a single pipeline and scored with
    ``cross_val_score``. The data comes from the notebook-level variables
    ``X`` (predictors) and ``y`` (target), which must already be defined
    when this function is called.

    Parameters
    ----------
    preprocessor : transformer
        First pipeline step (e.g. the ColumnTransformer built earlier).
    model : estimator
        Final pipeline step.
    cv : int or cross-validation splitter, default 5
        Passed through to ``cross_val_score``.
    scoring : str or callable, default 'accuracy'
        Passed through to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One score per cross-validation fold.
    """
    from sklearn.model_selection import cross_val_score

    # Bundle preprocessing and modeling code in a pipeline
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)
                                 ])

    return cross_val_score(my_pipeline, X, y, cv=cv, scoring=scoring)
# print(f"Accuracy scores:\n{scores}\n")
# print(f"Average scores: {round(scores.mean()*100, 2)}%")

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Candidate estimators to cross-validate. Earlier experiments are kept here,
# commented out, for reference.
models = [
    # RandomForestClassifier(n_estimators=125, random_state=0),
    # RandomForestClassifier(n_estimators=150, random_state=0),
    # RandomForestClassifier(n_estimators=175, random_state=0),
    # RandomForestClassifier(n_estimators=200, random_state=0),
    # RandomForestClassifier(n_estimators=1000, random_state=0),
    # XGBClassifier(random_state=0),
    XGBClassifier(n_estimators=100, learning_rate=0.001, random_state=0),
    # KNeighborsClassifier(n_neighbors=45),
    # MultinomialNB(),
    # SVC(kernel='linear'),
]

# Report mean and standard deviation of the fold accuracies for each candidate.
for model in models:
    score = get_score(preprocessor, model)
    avg_pct = round(score.mean()*100, 2)
    std_pct = round(score.std()*100, 2)
    print(f"{model}\nAvg accuracy: {avg_pct}%\nStd of accuracy: {std_pct}%\n\n")

In [None]:
from sklearn.metrics import accuracy_score

# Use the first candidate from the comparison above as the final model.
model = models[0]

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Score the hold-out predictions here: `preds` is overwritten with test-set
# predictions in the submission step, so this is the only place they are used.
valid_accuracy = accuracy_score(y_valid, preds)
print(f"Validation accuracy: {round(valid_accuracy*100, 2)}%")

In [None]:
# Submitting

# Engineer features on the test data (feat_eng modifies df_test in place),
# then get predictions from the fitted pipeline
preds = my_pipeline.predict(feat_eng(df_test))

In [None]:
# Assemble the submission: one row per test passenger, with each prediction
# converted to a Python bool, and write it without the index column.
submission_columns = {
    'PassengerId': df_test['PassengerId'],
    'Transported': list(map(bool, preds)),
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame that was written to submission.csv.
output