<a href="https://colab.research.google.com/github/PashaIanko/Kaggle.RwandaCO2Emissions/blob/data_preparation_baseline/2_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [20]:
from google.colab import drive

# Datasets
import pandas as pd

# Numerics
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Preprocessing & pipelines
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Data management
from sklearn.model_selection import train_test_split

# Other
import os

# Setup

In [3]:
# %%capture
GIT_DOWNLOAD_PATH = 'https://raw.githubusercontent.com/PashaIanko/Sklearn-Utils/main/'
FILES_LIST = [
    'path_manager.py',
    'sklearn_transformers.py',
    'sklearn_utils.py',
    'model.py'
]
GDRIVE_PATH = '/content/gdrive/MyDrive/'
PREPROC_TRIAL = 1
MODELS_TRIAL = 1
COMPETITION_PATH = GDRIVE_PATH + 'ML/Competitions/8.CO2Emissions/' # e.g. GDRIVE_PATH + 'ML/Competitions/8.CO2Emissions/'
# --------------------------------------------------
try:
    from nbpep8.nbpep8 import pep8
except ModuleNotFoundError:
    !pip install pycodestyle
    !pip install --index-url https://test.pypi.org/simple/ nbpep8
from nbpep8.nbpep8 import pep8
# ---------------------------------------
def download_files(url_dict):
    for file, url in url_dict.items():
        print(f'Downloading {file}')
        !wget -O {file} {url} {file}
# Map every helper file name to its raw GitHub URL, then fetch them all.
url_dict = {file: GIT_DOWNLOAD_PATH + file for file in FILES_LIST}
download_files(url_dict)
# ---------------------------------------
import importlib
import path_manager
import sklearn_utils
import sklearn_transformers
import model
def reload_all(modules_list_):
    """Call ``importlib.reload`` on each module of ``modules_list_``, in order.

    Parameters
    ----------
    modules_list_ : iterable of module
        Already-imported modules to reload.
    """
    for loaded_module in modules_list_:
        importlib.reload(loaded_module)
# Reload the helper modules so that the copies downloaded above replace any
# version already imported in this kernel.
MODULES_LIST = [path_manager, sklearn_utils, sklearn_transformers, model]
reload_all(MODULES_LIST)
# ---------------------------------------
from path_manager import PathManager
from model import Model
from sklearn_utils import nan_statistics
from sklearn_utils import boxplot_regression
from sklearn_utils import get_correlated_attributes
from sklearn_utils import visualize_datasets_distributions
from sklearn_transformers import ColumnDropper
from sklearn_transformers import LogTransformer
# ---------------------------------------
# Mount Google Drive, then build the PathManager for this competition and
# these trial numbers (setup_paths is defined in the downloaded
# path_manager.py).
from google.colab import drive
drive.mount('/content/gdrive')
manager = PathManager(
    models_trial=MODELS_TRIAL,
    preprocessing_trial=PREPROC_TRIAL,
    competition_path=COMPETITION_PATH,
)
manager.setup_paths()

Collecting pycodestyle
  Downloading pycodestyle-2.11.0-py2.py3-none-any.whl (31 kB)
Installing collected packages: pycodestyle
Successfully installed pycodestyle-2.11.0
Looking in indexes: https://test.pypi.org/simple/
Collecting nbpep8
  Downloading https://test-files.pythonhosted.org/packages/c1/07/6b91c986efe7c3adac9e2ec061037f0cc4307925819d37277c3802c2d117/nbpep8-0.0.15-py3-none-any.whl (3.2 kB)
Installing collected packages: nbpep8
Successfully installed nbpep8-0.0.15
a
Downloading path_manager.py
--2023-09-28 15:18:57--  https://raw.githubusercontent.com/PashaIanko/Sklearn-Utils/main/path_manager.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3482 (3.4K) [text/plain]
Saving to: ‘path_manager.py’


2023-09-28 15:18:57 (41.4 MB/s) - ‘path_manager.py

# Load the data

In [16]:
# Labeled data (it carries the 'emission' column), read from the path
# provided by the PathManager.
df_train = pd.read_csv(filepath_or_buffer=manager.train_path)

In [17]:
# Data to predict on for the submission, read from the PathManager's
# test path.
df_submission = pd.read_csv(filepath_or_buffer=manager.test_path)

In [18]:
# The two frames must share every column except the target, which only the
# labeled data has.
train_columns = set(df_train.columns)
submission_columns = set(df_submission.columns)
assert df_train.shape[1] == df_submission.shape[1] + 1
assert train_columns - submission_columns == {'emission'}

# Split the data

- We split the available labeled data into a **trainval** set and a **test** set, and then split the trainval set into **train** and **validation** sets.

In [27]:
TRAINVAL_PERCENT = 0.85  # share of the labeled data kept for train + validation
TRAIN_PERCENT = 0.8      # share of trainval used for training
RANDOM_STATE = 42
TARGET_COLUMN = 'emission'

# This cell rebinds `df_train` to the training subset. Splitting `df_train`
# itself would therefore shrink the data a little more on every re-run, so
# the split always starts from a fresh read of the full labeled data.
df_labeled = pd.read_csv(manager.train_path)

df_trainval, df_test = train_test_split(
    df_labeled,
    train_size=TRAINVAL_PERCENT,
    random_state=RANDOM_STATE
)

df_train, df_val = train_test_split(
    df_trainval,
    train_size=TRAIN_PERCENT,
    random_state=RANDOM_STATE
)

# The three parts must partition the labeled data: no rows lost or duplicated.
assert len(df_train) + len(df_val) + len(df_test) == len(df_labeled)

df_train.shape, df_val.shape, df_test.shape

((16836, 76), (4210, 76), (3714, 76))

## Make sure train, validation and test are from the same distribution

**TODO:** Not implemented yet — the comparison code takes too long to run on datasets of this size.

# Process the data

## NAN check

In [None]:
# NaN statistics for the training split. The frame is named `df_train` in
# this notebook; `train_df` is never defined and raised a NameError.
_ = nan_statistics(df_train)