<a href="https://colab.research.google.com/github/PashaIanko/Kaggle.ParisHousingPrices/blob/stacking_submit/5_model_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
from google.colab import drive
# Datasets
import pandas as pd
# Numerics
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing & pipelines
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Data management
from sklearn.model_selection import train_test_split
# Other
import os

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
%%capture
# --- Helper modules fetched from GitHub (base URL + file names) ---
GIT_DOWNLOAD_PATH = 'https://raw.githubusercontent.com/PashaIanko/Sklearn-Utils/main/'
FILES_LIST = [
    'path_manager.py', 'sklearn_transformers.py',
    'sklearn_utils.py', 'model.py',
]

# --- Trial numbers handed to PathManager when it is constructed ---
PREPROC_TRIAL, MODELS_TRIAL = 2, 1

# --- Google Drive layout (Drive is mounted under /content/gdrive) ---
GDRIVE_PATH = '/content/gdrive/MyDrive/'
COMPETITION_PATH = f'{GDRIVE_PATH}ML/Competitions/10.ParisHousePrices/'
# --------------------------------------------------
try:
    from nbpep8.nbpep8 import pep8
except ModuleNotFoundError:
    !pip install pycodestyle
    !pip install --index-url https://test.pypi.org/simple/ nbpep8
from nbpep8.nbpep8 import pep8
# ---------------------------------------
def download_files(url_dict):
    for file, url in url_dict.items():
        print(f'Downloading {file}')
        !wget -O {file} {url} {file}
# Map each helper file name to its full download URL, then fetch them all.
url_dict = {file: GIT_DOWNLOAD_PATH + file for file in FILES_LIST}
download_files(url_dict)
# ---------------------------------------
import importlib
import path_manager
import sklearn_utils
import sklearn_transformers
import model
def reload_all(modules_list_):
    """Call ``importlib.reload`` on each module of ``modules_list_``, in order.

    Nothing is returned; an error raised by a reload propagates to the caller.
    """
    for loaded_module in modules_list_:
        importlib.reload(loaded_module)
# Reload the helper modules imported above (presumably so that files that were
# just re-downloaded replace copies already imported in this session).
MODULES_LIST = [path_manager, sklearn_utils, sklearn_transformers, model]
reload_all(MODULES_LIST)
# ---------------------------------------
from path_manager import PathManager
from model import Model
from sklearn_utils import nan_statistics
from sklearn_utils import boxplot_regression
from sklearn_utils import get_correlated_attributes
from sklearn_utils import visualize_datasets_distributions
from sklearn_transformers import ColumnDropper
from sklearn_transformers import LogTransformer
# ---------------------------------------
from google.colab import drive
# Mount Google Drive at /content/gdrive; COMPETITION_PATH lives below it.
drive.mount('/content/gdrive')

# Path helper configured for this competition and trial pair.
manager = PathManager(competition_path=COMPETITION_PATH,
                      preprocessing_trial=PREPROC_TRIAL,
                      models_trial=MODELS_TRIAL)
manager.setup_paths()

# Load the data

In [3]:
# Show which files are stored under manager.data_trial_path.
os.listdir(manager.data_trial_path)

['train_processed.csv',
 'val_processed.csv',
 'test_processed.csv',
 'submission_processed.csv',
 'submission_elastic_20_10_23.csv',
 'rfr_20_10_23.csv']

In [5]:
# Read the four processed tables from manager.data_trial_path, in the order
# train, val, test, submission; the first CSV column becomes each frame's index.
df_train, df_val, df_test, df_submission = (
    pd.read_csv(f'{manager.data_trial_path}/{split}_processed.csv', index_col=0)
    for split in ('train', 'val', 'test', 'submission')
)

In [6]:
# For the labeled tables: every column but the last goes to X, the last
# column goes to Y (assumes the target is stored last -- confirm against
# the preprocessing notebook).
X_train = df_train.iloc[:, :-1]
Y_train = df_train.iloc[:, -1]

X_val = df_val.iloc[:, :-1]
Y_val = df_val.iloc[:, -1]

X_test = df_test.iloc[:, :-1]
Y_test = df_test.iloc[:, -1]

# The submission table is used whole: all of its columns are features.
X_submission = df_submission.iloc[:, :]

# Parallel lists: Xs[i] belongs with Ys[i]; the submission set has no labels.
Xs = [X_train, X_val, X_test, X_submission]
Ys = [Y_train, Y_val, Y_test, None]

# Load the models (final model)

In [9]:
# Load whatever manager.load_models() returns and select the entry stored
# under the 'StackRFRElastic' key as the final model.
all_models = manager.load_models()
final_model = all_models['StackRFRElastic']

In [10]:
# Bare expression: display the selected model as the cell output.
final_model

In [11]:
from sklearn.metrics import mean_squared_error

def report_model(X, Y, model):
    """Print the root-mean-squared error of ``model`` on ``(X, Y)``.

    Parameters
    ----------
    X : features passed to ``model.predict``.
    Y : true target values the predictions are compared against.
    model : any object exposing ``predict(X)``.

    Returns
    -------
    None. The RMSE is printed, not returned.
    """
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while this form works on old and new versions alike. For a single
    # target column (as used in this notebook) the value is identical.
    mse = mean_squared_error(y_true=Y, y_pred=model.predict(X))
    print(float(np.sqrt(mse)))

In [13]:
# RMSE on the training split, then on the validation split.
report_model(X_train, Y_train, final_model)
report_model(X_val, Y_val, final_model)

7212.426405853513
9920.157045523343


# Test report (run only once, after the final decision about the model has been made)

In [None]:
# RMSE on the test split.
report_model(X_test, Y_test, final_model)

# Retrain final model on all available data

In [17]:
# Refit on train + val + test, i.e. everything except the unlabeled
# submission set (the last entry of Xs / Ys).
# pd.concat keeps the inputs as DataFrame / Series and aligns columns by
# name, so the model is fitted with the same feature names it is later asked
# to predict on; np.vstack would drop the names and stack purely by position.
X_all = pd.concat(Xs[:-1])
Y_all = pd.concat(Ys[:-1])

# A column-name mismatch between the splits would widen the concatenated frame.
assert X_all.shape[1] == Xs[0].shape[1], 'splits do not share the same feature columns'
assert len(X_all) == len(Y_all), 'features and targets must have the same number of rows'

final_model.fit(X_all, Y_all)

In [19]:
# NOTE: final_model was just refitted on data that includes X_train and X_val,
# so the two values below are in-sample errors, not validation scores.
report_model(X_train, Y_train, final_model)
report_model(X_val, Y_val, final_model)



4720.223796534433
4206.798919744916




# Prepare a submission file

In [None]:
def prepare_submission(submission_filename, model, X=None):
    """Predict on the submission features and write an ``id,price`` CSV.

    Parameters
    ----------
    submission_filename : str
        Name of the CSV file created inside ``manager.data_trial_path``.
    model : object with ``predict``
        Model used to produce the predictions.
    X : optional
        Features to predict on. When omitted, the notebook-level
        ``X_submission`` is used, so two-argument calls behave as before.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids read
        from ``sample_submission.csv``.
    """
    if X is None:
        X = X_submission

    # Ids are taken from the sample submission file and paired with the
    # predictions by position (assumes X rows are in that same order --
    # TODO confirm against the preprocessing step).
    submission_ids = pd.read_csv(
        f'{manager.data_root_path}/sample_submission.csv', index_col='id'
    ).index

    final_predictions = model.predict(X)

    # Fail with an explicit message instead of pandas' generic length error.
    if len(final_predictions) != len(submission_ids):
        raise ValueError(
            f'Got {len(final_predictions)} predictions for '
            f'{len(submission_ids)} ids in sample_submission.csv'
        )

    pd.DataFrame(
        {
            'id': submission_ids,
            'price': final_predictions
        },
    ).to_csv(
        f'{manager.data_trial_path}/{submission_filename}',
        index=False
    )

In [32]:
# Write final_model's predictions to this file; prepare_submission creates it
# inside manager.data_trial_path.
submission_filename = 'stacking_rfr_elastic_21.10.23.csv'
prepare_submission(submission_filename, final_model)

# Save final model

In [34]:
# Pass the model to manager.save_models under the key 'final_model'.
manager.save_models(
    {'final_model': final_model}
)

In [35]:
# Bare expression: display the model as the cell output.
final_model