<a href="https://colab.research.google.com/github/PashaIanko/Kaggle.RwandaCO2Emissions/blob/main/3_model_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
# Datasets
import pandas as pd
# Numerics
import numpy as np
# Plotting
import matplotlib.pyplot as plt
# Preprocessing & pipelines
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Data management
from sklearn.model_selection import train_test_split
# Other
import os

pd.set_option('display.max_columns', None)

In [2]:
%%capture
# Remote location of the shared helper modules, and the files fetched from it.
GIT_DOWNLOAD_PATH = (
    'https://raw.githubusercontent.com/PashaIanko/Sklearn-Utils/main/'
)
FILES_LIST = ['path_manager.py', 'sklearn_transformers.py']
FILES_LIST += ['sklearn_utils.py', 'model.py']

# Trial identifiers handed to PathManager further down in this cell.
PREPROC_TRIAL = MODELS_TRIAL = 1

# Google Drive locations; the competition folder lives under the Drive root.
GDRIVE_PATH = '/content/gdrive/MyDrive/'
COMPETITION_PATH = ''.join((GDRIVE_PATH, 'ML/Competitions/8.CO2Emissions/'))
# --------------------------------------------------
try:
    from nbpep8.nbpep8 import pep8
except ModuleNotFoundError:
    !pip install pycodestyle
    !pip install --index-url https://test.pypi.org/simple/ nbpep8
from nbpep8.nbpep8 import pep8
# ---------------------------------------
def download_files(url_dict):
    for file, url in url_dict.items():
        print(f'Downloading {file}')
        !wget -O {file} {url} {file}
# Map each helper file name to its URL under GIT_DOWNLOAD_PATH, then fetch all.
url_dict = {file: GIT_DOWNLOAD_PATH + file for file in FILES_LIST}
download_files(url_dict)
# ---------------------------------------
import importlib
import path_manager
import sklearn_utils
import sklearn_transformers
import model
def reload_all(modules_list_):
    """Call ``importlib.reload`` on each module of ``modules_list_``, in order.

    Args:
        modules_list_: iterable of already-imported module objects.
    """
    for _reloaded in map(importlib.reload, modules_list_):
        pass
# Reload the helper modules imported above.
MODULES_LIST = [path_manager, sklearn_utils, sklearn_transformers, model]
reload_all(MODULES_LIST)
# ---------------------------------------
from path_manager import PathManager
from model import Model
from sklearn_utils import nan_statistics
from sklearn_utils import boxplot_regression
from sklearn_utils import get_correlated_attributes
from sklearn_utils import visualize_datasets_distributions
from sklearn_transformers import ColumnDropper
from sklearn_transformers import LogTransformer
# ---------------------------------------
from google.colab import drive

# Mount Google Drive before any path below it is used.
drive.mount('/content/gdrive')

# Build the path manager from the trial constants defined at the top of this
# cell, then call its setup_paths().
path_manager_options = {
    'competition_path': COMPETITION_PATH,
    'preprocessing_trial': PREPROC_TRIAL,
    'models_trial': MODELS_TRIAL,
}
manager = PathManager(**path_manager_options)
manager.setup_paths()

# Load the data

In [43]:
# Processed splits stored under the data-trial directory; the first CSV
# column is used as the index of each frame.
processed_csv_paths = (
    f'{manager.data_trial_path}/train_processed.csv',
    f'{manager.data_trial_path}/val_processed.csv',
    f'{manager.data_trial_path}/test_processed.csv',
    f'{manager.data_trial_path}/submission_processed.csv',
)
df_train, df_val, df_test, df_submission = (
    pd.read_csv(csv_path, index_col=0) for csv_path in processed_csv_paths
)

In [58]:
# Read the CSV found at manager.train_path, keeping pandas' default index.
original_train_csv = f'{manager.train_path}'
df_train_original = pd.read_csv(original_train_csv)

In [61]:
# Preview the first rows of the frame read from manager.train_path.
df_train_original.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_CO_column_number_density,CarbonMonoxide_H2O_column_number_density,CarbonMonoxide_cloud_height,CarbonMonoxide_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,CarbonMonoxide_sensor_zenith_angle,CarbonMonoxide_solar_azimuth_angle,CarbonMonoxide_solar_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_tropopause_pressure,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_azimuth_angle,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_solar_azimuth_angle,NitrogenDioxide_solar_zenith_angle,Formaldehyde_tropospheric_HCHO_column_number_density,Formaldehyde_tropospheric_HCHO_column_number_density_amf,Formaldehyde_HCHO_slant_column_number_density,Formaldehyde_cloud_fraction,Formaldehyde_solar_zenith_angle,Formaldehyde_solar_azimuth_angle,Formaldehyde_sensor_zenith_angle,Formaldehyde_sensor_azimuth_angle,UvAerosolIndex_absorbing_aerosol_index,UvAerosolIndex_sensor_altitude,UvAerosolIndex_sensor_azimuth_angle,UvAerosolIndex_sensor_zenith_angle,UvAerosolIndex_solar_azimuth_angle,UvAerosolIndex_solar_zenith_angle,Ozone_O3_column_number_density,Ozone_O3_column_number_density_amf,Ozone_O3_slant_column_number_density,Ozone_O3_effective_temperature,Ozone_cloud_fraction,Ozone_sensor_azimuth_angle,Ozone_sensor_zenith_angle,Ozone_solar_azimuth_angle,Ozone_solar_zenith_angle,UvAerosolLayerHeight_aerosol_height,Uv
AerosolLayerHeight_aerosol_pressure,UvAerosolLayerHeight_aerosol_optical_depth,UvAerosolLayerHeight_sensor_zenith_angle,UvAerosolLayerHeight_sensor_azimuth_angle,UvAerosolLayerHeight_solar_azimuth_angle,UvAerosolLayerHeight_solar_zenith_angle,Cloud_cloud_fraction,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,50.843559,-130.050797,35.874496,-2.7e-05,0.03537,1589.024536,4061.098145,829530.5,71.111977,52.775928,-149.875565,25.965214,,,,,,,,,,,,,0.000117,0.86323,3.8e-05,0.255668,35.874496,-130.050797,50.843559,-98.593887,-1.280761,829864.546875,-12.628979,35.632416,-138.786446,30.752128,0.115927,2.506609,0.295663,225.731144,0.595473,-12.628979,35.632416,-138.786446,30.752128,,,,,,,,0.595473,53534.732422,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,39.137194,-140.874435,28.965133,1.2e-05,0.036526,1772.574405,1869.040414,829787.28713,-1.019594,38.982368,-140.158048,29.562,4.7e-05,1.639765e-05,3e-05,9.3e-05,7311.869141,-1.935386,0.067038,829859.960368,5.471037,35.265195,-138.343908,30.054262,0.00017,1.172826,0.000143,0.200754,29.071781,-141.814827,43.050213,4.678839,-1.548119,829747.856973,16.152492,43.485327,-142.786141,28.573627,0.116775,2.657704,0.315733,226.17217,0.175166,24.464335,42.596541,-143.097868,28.213655,,,,,,,,0.213608,63790.296241,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,52.868816,-150.191757,23.206415,0.000154,0.035338,2703.2368,2809.138386,829883.828686,-54.801144,52.344378,-133.683714,31.586838,3.1e-05,4.267369e-07,3e-05,8e-05,7311.869141,-2.754374,0.072135,829527.125,72.795837,52.868816,-150.191757,23.206415,8e-05,1.175467,1.9e-05,0.279449,30.99429,-135.66716,52.810561,-41.363579,-1.038673,829892.960629,-41.557633,41.269033,-135.364627,30.273304,0.117039,2.619104,0.310828,227.469292,0.606091,-41.557633,41.269033,-135.364627,30.273304,,,,,,,,0.70354,55923.790554,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,,,,,0.03679,2076.073332,3917.707873,829657.163571,28.916541,39.676184,-142.575915,24.810699,,,,,,,,,,,,,,,,,,,,,-0.626435,829794.848214,-0.00127,34.45874,-137.489602,26.936477,0.116434,2.525818,0.297966,225.58944,0.787398,-0.00127,34.45874,-137.489602,26.936477,,,,,,,,0.782806,44569.130636,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,35.515587,-137.409159,24.331972,-2.8e-05,0.034675,2053.60849,2667.310013,829735.09375,-12.501663,33.703073,-134.854258,24.629593,5.1e-05,2.056437e-05,3e-05,9.3e-05,7637.262458,-1.450563,0.049393,829744.84375,-13.431798,35.078624,-136.257947,24.729026,0.000269,0.869081,0.000146,0.16009,25.977935,-134.826557,39.949069,-12.837398,-1.584896,829736.142857,-0.604325,41.794705,-136.448518,25.045785,0.117373,2.572243,0.306688,228.95584,0.215739,-0.604325,41.794705,-136.448518,25.045785,,,,,,,,0.189336,59904.314844,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


# Sample a subset of the training data

In [4]:
# Fraction of df_train kept for quick experiments, and the seed for the split.
SUBSET_SIZE = 0.05
RANDOM_STATE = 42

# train_test_split is used purely as a seeded sampler: the "train" part is
# the subset and the remainder is discarded.
split_options = dict(train_size=SUBSET_SIZE, random_state=RANDOM_STATE)
df_subset, _ = train_test_split(df_train, **split_options)

print(df_subset.shape)

(2531, 85)


# Find the baseline model

## Random forest regressor

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


# Single-candidate grid: the search below is used for its 3-fold CV scores.
grid = {
    'n_estimators': [300]
}

# The forest is seeded with RANDOM_STATE (defined in the sampling cell) so
# that repeated runs give the same CV scores; an unseeded forest does not.
res = GridSearchCV(
    RandomForestRegressor(random_state=RANDOM_STATE),
    grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    refit=True,
    cv=3,
    verbose=4,
    return_train_score=True
)

In [12]:
# Fit on the DataFrame itself (not `.values`): the later `res.predict(...)`
# calls pass DataFrames, and fitting on a bare ndarray makes scikit-learn
# warn about feature names at predict time.
# Features are every column except the last; the target is the 'target' column.
cv_results = res.fit(
    X=df_subset.iloc[:, :-1],
    y=df_subset['target']
)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [15]:
# `cv_results` is the object returned by `res.fit(...)`; show its CV summary.
cv_results.cv_results_

{'mean_fit_time': array([33.1655151]),
 'std_fit_time': array([2.11504644]),
 'mean_score_time': array([0.07895931]),
 'std_score_time': array([0.00652069]),
 'param_n_estimators': masked_array(data=[300],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 300}],
 'split0_test_score': array([-1512.79940571]),
 'split1_test_score': array([-1661.62874604]),
 'split2_test_score': array([-1666.09800466]),
 'mean_test_score': array([-1613.5087188]),
 'std_test_score': array([71.23560849]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([-232.02623107]),
 'split1_train_score': array([-235.47486798]),
 'split2_train_score': array([-222.52059878]),
 'mean_train_score': array([-230.00723261]),
 'std_train_score': array([5.4778669])}

In [9]:
from sklearn.metrics import mean_squared_error

# MSE of the refit search on the same subset it was fitted on.
# Keyword arguments keep the (y_true, y_pred) roles explicit; the value is
# unchanged because MSE is symmetric in its two arguments.
mean_squared_error(
    y_true=df_subset['target'],
    y_pred=res.predict(df_subset.iloc[:, :-1])
)



176.7344697906446

In [11]:
# MSE on the first 1500 rows of the validation frame. Target and features
# are taken from the same positional slice so they stay aligned.
val_head = df_val.iloc[:1500]
mean_squared_error(
    y_true=val_head['target'],
    y_pred=res.predict(val_head.iloc[:, :-1])
)



1435.2928295346253

## K-nearest neighbors regressor

In [14]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Single-candidate grid; note that this rebinds the notebook-level `grid`.
grid = dict(n_neighbors=[5])

# 3-fold CV scored with negative MSE.
knn_search_options = dict(scoring='neg_mean_squared_error', cv=3)
knn_estimator = GridSearchCV(
    KNeighborsRegressor(), grid, **knn_search_options
)

In [17]:
# Fit on the full training frame: every column except the last is a feature,
# and the 'target' column is the regression target.
knn_res = knn_estimator.fit(
    X=df_train.iloc[:, :-1],
    y=df_train['target'],
)

In [18]:
# `knn_res` is the object returned by `knn_estimator.fit(...)`; show its CV summary.
knn_res.cv_results_

{'mean_fit_time': array([0.0523773]),
 'std_fit_time': array([0.02193998]),
 'mean_score_time': array([7.50582846]),
 'std_score_time': array([1.81215108]),
 'param_n_neighbors': masked_array(data=[5],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 5}],
 'split0_test_score': array([-3849.22534046]),
 'split1_test_score': array([-3875.42966489]),
 'split2_test_score': array([-3778.71986439]),
 'mean_test_score': array([-3834.45828991]),
 'std_test_score': array([40.83908026]),
 'rank_test_score': array([1], dtype=int32)}

## Feed-forward neural network (FFNN)

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [52]:
# Four identical 200-unit ReLU layers followed by a single linear output unit.
# NOTE(review): the name `model` also refers to the helper module imported in
# the setup cell (`import model`); this assignment shadows that module.
hidden_stack = [Dense(200, activation='relu') for _ in range(4)]
output_layer = Dense(1, activation='linear')
model = Sequential(hidden_stack + [output_layer])

In [53]:
# Train with the Adam optimizer against a mean-squared-error loss.
compile_options = {'optimizer': 'adam', 'loss': 'mean_squared_error'}
model.compile(**compile_options)

In [54]:
# Largest single value across all columns of df_train except the last one.
np.max(df_train.iloc[:, :-1].values)

12.059110901994984

In [55]:
# (rows, columns) of the processed training frame.
df_train.shape

(50634, 84)

In [56]:
# Preview the first rows of the processed training frame.
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,target
0,-0.033434,-0.223007,-0.769164,-0.470101,-1.960467,0.690431,-1.028234,-3.503432,-1.148337,0.823714,-0.231703,0.243616,-0.005711,-2.411621,0.740156,-1.229431,-3.234112,-1.693426,0.019272,0.585306,-1.123434,-0.032447,-1.144563,1.16915,-2.249568,0.65908,-0.505943,-1.721396,0.642096,-1.415943,0.457338,-1.116464,1.371678,1.123543,-1.763439,1.814344,-1.815579,1.849446,1.395672,0.458493,-1.010926,1.383267,-1.09122,1.468735,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903846,118.48082
1,0.399911,0.526932,-0.213859,-0.497901,0.882571,0.404851,0.257911,-0.057499,-0.907512,0.426898,-0.508529,-0.220758,-0.509467,-1.051296,-0.019064,-1.053178,-0.889699,0.45297,0.089177,0.329206,0.475656,0.242677,-0.896986,0.345865,0.351991,0.023576,0.186956,1.003661,-0.45936,0.489573,0.271003,-0.900059,0.335088,0.398733,0.370428,-0.31468,0.419668,-0.353951,1.663656,1.652544,0.513775,1.504095,-0.88367,0.362673,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.846154,116.39622
2,1.482563,0.975655,-1.209269,-0.225636,0.149723,-1.059689,-0.115722,1.016351,-0.877435,0.473048,-1.152748,1.538541,-1.18852,0.712331,0.525428,-0.919142,-0.073169,-0.590529,-0.130605,-1.681386,0.472466,0.988812,-0.891311,0.210462,-0.359438,0.915939,0.651777,-0.620766,0.802154,0.486382,1.018525,-0.894368,0.199244,0.686313,-0.792108,0.861051,-0.769549,0.847335,1.800205,0.438204,0.343329,0.805908,-0.875297,0.237213,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.826923,63.855354
3,-0.753781,-1.471251,0.998603,-0.720235,-0.423673,1.001425,0.793979,-0.885459,1.335301,0.779835,-0.561156,-1.419071,-0.755817,0.381892,0.69546,1.330607,-0.9749,0.729629,0.357126,1.755863,0.002333,-1.745838,1.278407,1.164335,0.37491,-1.243205,-0.985171,0.779632,0.205166,0.016096,-1.721206,1.281498,1.156217,1.079003,0.591194,-0.760027,0.748161,-0.856757,-0.640983,-0.565917,0.520229,-1.48375,1.295927,1.098152,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519231,28.91566
4,-1.334889,0.624858,0.232703,0.166392,0.900292,1.909921,-0.900427,-0.386418,-0.437383,0.59981,-2.195334,-0.608768,-2.193875,1.685348,1.036409,-0.481085,0.422827,-1.623117,-0.303133,1.076188,-0.017987,-0.48161,-0.463535,-0.431207,1.553509,-0.077799,0.700507,0.897753,1.185788,-0.00423,-0.454628,-0.46538,-0.444509,0.991158,0.164414,-0.220436,0.259887,-0.289534,-0.563488,-0.374854,-0.081328,-0.500479,-0.451784,-0.391965,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.788462,44.708138


In [57]:
# Features are every column except the last; 'target' is the regression target.
# NOTE(review): validation_split holds out part of df_train instead of using
# the separately loaded df_val -- confirm this is intended, and check the Keras
# docs for which rows validation_split selects.
fit_options = dict(epochs=10, validation_split=0.2, batch_size=32)
history = model.fit(
    x=df_train.iloc[:, :-1], y=df_train['target'], **fit_options
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
