<a href="https://colab.research.google.com/github/PashaIanko/Kaggle.Spaceship-Titanic/blob/feature_selection/5_model_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
from google.colab import drive

import pandas as pd

import os

In [2]:
%%capture
# Install CatBoost into the runtime; %%capture hides pip's console output.
# NOTE(review): presumably only needed when a CatBoost model is loaded - confirm.
!pip install catboost

## PEP8 standard

In [3]:
# PEP 8 
%%capture 
!pip install pycodestyle 
!pip install --index-url https://test.pypi.org/simple/ nbpep8 
from nbpep8.nbpep8 import pep8

## Importing custom packages

In [4]:
%%capture
def download_files(url_dict):
    for file, url in url_dict.items():
        !wget -O {file} {url} {file}


# Root of the raw-file URLs for the helper modules
git_download_path = (
    'https://raw.githubusercontent.com/PashaIanko/Sklearn-Utils/main/'
)

# Helper modules to fetch; each is saved under its own file name
files = [
    'path_manager.py',
    'sklearn_transformers.py',
    'sklearn_utils.py',
    'model.py',
]

# local file name -> download URL
url_dict = dict(
    (file_name, git_download_path + file_name) for file_name in files
)
download_files(url_dict)

In [5]:
import importlib
import path_manager
# import sklearn_utils
# import sklearn_transformers
# import model


def reload_modules(modules):
    """Reload each already-imported module so its current source is used.

    Parameters
    ----------
    modules : iterable of module
        Module objects to pass to ``importlib.reload``, in the given order.
    """
    for loaded_module in modules:
        importlib.reload(loaded_module)


# Only path_manager is imported in this notebook. Add sklearn_transformers,
# sklearn_utils or model to the list once their imports are enabled.
reload_modules([path_manager])

from path_manager import PathManager
# from sklearn_transformers import ColumnDropper
# from sklearn_utils import inf_statistics
# from model import Model

# Set up paths

In [6]:
# Trial numbers handed to PathManager for the data and model folders
PREPROC_TRIAL = 10
MODELS_TRIAL = 10

# Mount Google Drive; all competition files are read from below this mount
drive.mount('/content/gdrive')
gdrive_path = '/content/gdrive/MyDrive/'
COMPETITION_PATH = gdrive_path + 'ML/Competitions/6.SpaceshipTitanic/'

manager = PathManager(
    competition_path=COMPETITION_PATH,
    preprocessing_trial=PREPROC_TRIAL,
    models_trial=MODELS_TRIAL,
)
manager.setup_paths()

Mounted at /content/gdrive
/content/gdrive/MyDrive/ML/Competitions/6.SpaceshipTitanic/Data/preproc_trial_10 already exists
/content/gdrive/MyDrive/ML/Competitions/6.SpaceshipTitanic/Models/trial_10 already exists


# Load the trained models

In [7]:
# Only the Optuna-tuned LightGBM model is loaded. Other saved candidates:
# 'RandomForest_gridsearch', 'LogReg_gridsearch', 'XGB_gridsearch',
# 'CatBoost_gridsearch', 'LGBM_gridsearch'
models = manager.load_models(['LGBMOptuna'])
models

{'LGBMOptuna': LGBMClassifier(boosting_type='dart', learning_rate=0.115, max_depth=12,
                min_child_samples=1, n_estimators=475, num_leaves=7,
                objective='binary', reg_alpha=0.65, reg_lambda=0.95)}

# Load the data

In [8]:
# Show which files are stored in the manager's data-trial directory
os.listdir(manager.data_trial_path)

['trainval_processed.csv', 'test_sample_processed.csv', 'test_processed.csv']

In [9]:
# Held-out labelled sample; the saved index column becomes the frame index
test_sample_csv_path = os.path.join(
    manager.data_trial_path, 'test_sample_processed.csv'
)
df_test_sample = pd.read_csv(test_sample_csv_path, index_col='Unnamed: 0')

# The last column is used as the target, all preceding ones as features
X_test_sample = df_test_sample.values[:, :-1]
Y_test_sample = df_test_sample.values[:, -1]
df_test_sample.shape

(1739, 24)

In [10]:
# Processed test set; every column is used as a feature
test_csv_path = os.path.join(manager.data_trial_path, 'test_processed.csv')
df_test_processed = pd.read_csv(test_csv_path, index_col='Unnamed: 0')

X_test = df_test_processed.values
df_test_processed.shape

(4277, 23)

In [11]:
# Processed train+validation set; the saved index column becomes the index
trainval_csv_path = os.path.join(
    manager.data_trial_path, 'trainval_processed.csv'
)
df_trainval_processed = pd.read_csv(trainval_csv_path, index_col='Unnamed: 0')

# The last column is used as the target, all preceding ones as features
X_trainval_processed = df_trainval_processed.values[:, :-1]
Y_trainval_processed = df_trainval_processed.values[:, -1]

df_trainval_processed.shape

(6954, 24)

# Report test sample performance

In [12]:
from sklearn.metrics import accuracy_score


def get_accuracy(model_, X_, Y_):
    """Score ``model_`` on a labelled data set with ``accuracy_score``.

    Parameters
    ----------
    model_ : object
        Fitted estimator exposing ``predict``.
    X_ : array-like
        Features passed to ``model_.predict``.
    Y_ : array-like
        Ground-truth labels the predictions are compared against.

    Returns
    -------
    The value returned by ``sklearn.metrics.accuracy_score``.
    """
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first
    return accuracy_score(Y_, model_.predict(X_))


# Report each loaded model's accuracy on the held-out test sample
for model_name, model in models.items():
    sample_accuracy = get_accuracy(model, X_test_sample, Y_test_sample)
    print(
        f'''
        {model_name}: {sample_accuracy}
        '''
    )

# Style-check the executed cells (`_ih` is IPython's input history)
pep8(_ih)


        LGBMOptuna: 0.7952846463484762
        



# Retrain on full available data

In [13]:
import numpy as np

# Append the test-sample rows below the train/validation rows, for both the
# feature matrix and the target vector
X_full = np.concatenate([X_trainval_processed, X_test_sample], axis=0)
Y_full = np.hstack([Y_trainval_processed, Y_test_sample])

# One target per feature row
assert len(Y_full) == X_full.shape[0]

In [14]:
from sklearn.base import clone

# Refit an unfitted copy with the same hyper-parameters rather than the object
# stored in `models`. Fitting that object in place would overwrite the model
# evaluated on the test sample, so re-running the evaluation cell would score
# a model that has already seen that sample.
final_model = clone(models['LGBMOptuna'])  # models['LogReg_gridsearch']

final_model.fit(
    X_full,
    Y_full
)

predictions = final_model.predict(df_test_processed.values)

# Save predictions

In [15]:
# Read the original (unprocessed) test set; only PassengerId is kept from it
raw_test_path = os.path.join(manager.data_root_path, 'test.csv')
passenger_ids = pd.read_csv(raw_test_path).loc[:, ['PassengerId']]

predictions = final_model.predict(df_test_processed.values)

# Attach the predictions as a boolean column and index the frame by passenger
df_submission = (
    passenger_ids
    .assign(Transported=predictions)
    .astype({'Transported': 'bool'})
    .set_index('PassengerId')
)
df_submission.head()

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True


In [16]:
final_model_name = 'LGBMOptuna_final'

# The submission file is written into the folder named after the final model
# NOTE(review): presumably that folder is created by save_models - confirm.
submission_path = os.path.join(
    manager.models_trial_path, final_model_name, 'submission.csv'
)

# Save the refitted model first, then write the submission CSV
manager.save_models({final_model_name: final_model})
df_submission.to_csv(submission_path)