<a href="https://colab.research.google.com/github/PashaIanko/Kaggle.Spaceship-Titanic/blob/main/5_model_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [10]:
from google.colab import drive

import pandas as pd

import os

## PEP8 standard

In [2]:
# PEP 8 
%%capture 
!pip install pycodestyle 
!pip install --index-url https://test.pypi.org/simple/ nbpep8 
from nbpep8.nbpep8 import pep8

## Importing custom packages

In [3]:
%%capture
def download_files(url_dict):
    for file, url in url_dict.items():
        !wget -O {file} {url} {file}


# Base URL of the raw files in the Sklearn-Utils GitHub repository
git_download_path = (
    'https://raw.githubusercontent.com/PashaIanko/Sklearn-Utils/main/'
)

# Helper modules fetched into the working directory
files = [
    'path_manager.py', 'sklearn_transformers.py',
    'sklearn_utils.py', 'model.py',
]

# Local file name -> URL it is downloaded from
url_dict = dict((name, git_download_path + name) for name in files)
download_files(url_dict)

In [4]:
import importlib
import path_manager
# import sklearn_utils
# import sklearn_transformers
# import model


def reload_modules(modules):
    """Reload each module of ``modules`` in order via ``importlib.reload``.

    Parameters
    ----------
    modules : iterable of modules
        Already-imported module objects. Anything ``importlib.reload``
        rejects makes that call raise, which stops the loop.

    Returns
    -------
    None
    """
    for mod in modules:
        importlib.reload(mod)


# Reload the one helper module this notebook imports.
# Not imported here, hence not reloaded:
# sklearn_transformers, sklearn_utils, model
reload_modules([path_manager])

from path_manager import PathManager
# from sklearn_transformers import ColumnDropper
# from sklearn_utils import inf_statistics
# from model import Model

# Set up paths

In [5]:
# Mount Google Drive; every path below lives under the mount point
drive.mount('/content/gdrive', force_remount=True)

# Trial numbers handed to PathManager
PREPROC_TRIAL = 1
MODELS_TRIAL = 1

# Competition folder on the mounted drive
gdrive_path = '/content/gdrive/MyDrive/'
COMPETITION_PATH = gdrive_path + 'ML/Competitions/6.SpaceshipTitanic/'

manager = PathManager(
    models_trial=MODELS_TRIAL,
    preprocessing_trial=PREPROC_TRIAL,
    competition_path=COMPETITION_PATH,
)
manager.setup_paths()

Mounted at /content/gdrive
/content/gdrive/MyDrive/ML/Competitions/6.SpaceshipTitanic/Data/preproc_trial_1 already exists
/content/gdrive/MyDrive/ML/Competitions/6.SpaceshipTitanic/Models/trial_1 already exists


# Load the trained models

In [8]:
# Fetch the two grid-searched estimators by their saved names; the result is
# displayed below as a dict keyed by model name
models = manager.load_models(
    ['RandomForest_gridsearch', 'LogReg_gridsearch']
)
models

{'RandomForest_gridsearch': RandomForestClassifier(max_depth=6, max_features='sqrt', n_estimators=75,
                        n_jobs=-1, random_state=42),
 'LogReg_gridsearch': LogisticRegression(C=0.1, l1_ratio=0, penalty='elasticnet', random_state=42,
                    solver='saga')}

# Load the data

In [12]:
# Show which files exist under manager.data_trial_path before loading them
os.listdir(manager.data_trial_path)

['train_processed.csv',
 'trainval_processed.csv',
 'test_sample_processed.csv',
 'test_processed.csv']

In [19]:
# Labelled hold-out sample used to score the tuned models
df_test_sample = pd.read_csv(
    os.path.join(manager.data_trial_path, 'test_sample_processed.csv')
)

# Last column -> target, all preceding columns -> features
X_test_sample, Y_test_sample = (
    df_test_sample.values[:, :-1],
    df_test_sample.values[:, -1],
)
df_test_sample.shape

(1739, 18)

In [20]:
# Processed test set that the submission is predicted on
df_test_processed = pd.read_csv(
    os.path.join(manager.data_trial_path, 'test_processed.csv')
)

# Every column is used as a feature here (no target column is split off)
X_test = df_test_processed.values[:, :]
df_test_processed.shape

(4277, 17)

In [31]:
# Processed train + validation data
df_trainval_processed = pd.read_csv(
    os.path.join(manager.data_trial_path, 'trainval_processed.csv')
)

# Last column -> target, all preceding columns -> features
X_trainval_processed, Y_trainval_processed = (
    df_trainval_processed.values[:, :-1],
    df_trainval_processed.values[:, -1],
)

df_trainval_processed.shape

(6954, 18)

# Report test sample performance

In [18]:
# Quick look at the random forest's raw predictions on the test-sample
# features (X_test_sample holds df_test_sample.values[:, :-1])
models['RandomForest_gridsearch'].predict(X_test_sample)

array([1., 1., 0., ..., 1., 0., 0.])

In [29]:
from sklearn.metrics import accuracy_score


def get_accuracy(model_, X_, Y_):
    """Return the accuracy of ``model_`` on the labelled set ``(X_, Y_)``.

    Parameters
    ----------
    model_ : object
        Fitted estimator exposing ``predict``.
    X_ : array-like
        Feature matrix handed to ``model_.predict``.
    Y_ : array-like
        Ground-truth labels for the rows of ``X_``.

    Returns
    -------
    The value returned by ``accuracy_score`` (by default the fraction of
    correctly classified samples).
    """
    # scikit-learn's signature is accuracy_score(y_true, y_pred). Passing by
    # keyword keeps ground truth and predictions in their documented roles.
    return accuracy_score(y_true=Y_, y_pred=model_.predict(X_))


# Report the accuracy of every loaded model on the labelled test sample
for model_name, model in models.items():
    print(f'''
        {model_name}: {get_accuracy(model, X_test_sample, Y_test_sample)}
        ''')

# Style check through nbpep8; `_ih` is IPython's list of executed cell inputs
pep8(_ih)


        RandomForest_gridsearch: 0.777458309373203
        

        LogReg_gridsearch: 0.7929844738355377
        
cell_content.py:17:1: W293 blank line contains whitespace



# Retrain on full available data

In [40]:
import numpy as np

# Combine the train/validation rows with the labelled test sample so the
# final model is fitted on all labelled data loaded in this notebook
X_full = np.vstack((X_trainval_processed, X_test_sample))
Y_full = np.concatenate((Y_trainval_processed, Y_test_sample))

# Features and labels must stay row-aligned
assert len(Y_full) == X_full.shape[0]

In [41]:
from sklearn.base import clone

# Fit an unfitted copy carrying the tuned hyper-parameters. Fitting the object
# stored in `models` directly would overwrite the tuned model in place, so
# re-running the evaluation cell would score a model that has already seen
# the test sample.
final_model = clone(models['LogReg_gridsearch'])

# `fit` returns the estimator, which is displayed as the cell output
final_model.fit(X_full, Y_full)



LogisticRegression(C=0.1, l1_ratio=0, penalty='elasticnet', random_state=42,
                   solver='saga')

# Save predictions

In [54]:
# The sample submission shows the expected layout: PassengerId, Transported
df_submission_example = pd.read_csv(
    os.path.join(manager.data_root_path, 'sample_submission.csv')
)
df_submission_example

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [58]:
# Load the original test set and keep only the passenger ids
df_submission = pd.read_csv(
    os.path.join(manager.data_root_path, 'test.csv')
).loc[:, ['PassengerId']]

# Predict on the processed test features. NOTE(review): this assumes the rows
# of test_processed.csv are in the same order as test.csv - confirm.
predictions = final_model.predict(df_test_processed.values)

# Store the predictions as booleans in the Transported column
df_submission['Transported'] = predictions.astype('bool')
df_submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [59]:
final_model_name = 'LogReg_final'

# Submission file location: <models trial path>/<final model name>/
submission_path = os.path.join(
    manager.models_trial_path,
    final_model_name,
    'submission.csv'
)

# Persist the refitted model under its own name
manager.save_models({final_model_name: final_model})

# index=False writes only the PassengerId and Transported columns. With the
# default, pandas would also write the row index as an extra unnamed first
# column, which the sample submission does not have.
df_submission.to_csv(submission_path, index=False)