In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.data.download_data import download_from_google_drive
from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in colab using notebook 9.0. Unfortunatelly, the model saving methods `.save()` class method, and pickle, didn't produce the same predicted probabilities using the loaded models as compared to the ones predictied using the in-memory models. The loaded models predicted probabilities had were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
models_proba_path = ROOT_DIR / 'data/raw/models_proba_grouped/00'

In [4]:
file_ids = {
    'models_proba_grouped_baat_gp.csv': '1_FQOF2pf-r49OQVn-3INssy24aQqrFEa',
    'models_proba_grouped_cromer_knoll_gp.csv': '1-6U4gaPycCzvOoUxkLwnvjJJukUlEoFZ',
    'models_proba_grouped_dunlin_gp.csv': '1-6_rjkHeD-htgR5mQRzsH4WbivLSy21E',
    'models_proba_grouped_hegre_gp.csv': '1-NkNDg3jQMuxQYphxoKJPbs_ivoBIuCM',
    'models_proba_grouped_hordaland_gp.csv': '1-Sc9tS8UzoQH0hGh3G009jPHwM1kQ9XI',
    'models_proba_grouped_nordland_gp.csv': '1-VF2vMSL6-WFtNgQgZhY2Ua7ovFcUOgO',
    'models_proba_grouped_permian_gp.csv': '1-niVU5pOtyAaaOFDZbaoB1vvR_bAg9IN',
    'models_proba_grouped_rogaland_gp.csv': '1-oIPKtdr5tuxjEWlQh_G4VwT7Sn4qlzc',
    'models_proba_grouped_shetland_gp.csv': '1-sBpX-_Zrr3fWGggznI_pU8D15vEHh7E',
    'models_proba_grouped_vestland_gp.csv': '1-tKimccICbsxvy6IfvJBl_ghT3qAQWxm',
    'models_proba_grouped_vtb_gp.csv': '1-vFfXHqFCADH83TWXoO4FW68sp3bWcTk'
}

In [5]:
download_from_google_drive(file_ids, output_root=models_proba_path)

In [6]:
'models_proba_grouped_cromer_knoll_gp'.split('_', maxsplit=3)

['models', 'proba', 'grouped', 'cromer_knoll_gp']

In [7]:
models_probas = []
for file in models_proba_path.glob('*.csv'):
    model_proba = pd.read_csv(file)
    
    # Get GROUPED name from filename
    group = file.stem
    group = group.split('_', maxsplit=3)[3]
    
    # Assign group to keep track of data origin
    model_proba['GROUPED'] = group
    
    models_probas.append(model_proba)
    
models_probas = pd.concat(models_probas, ignore_index=True)

In [8]:
models_probas.columns

Index(['Sandstone', 'Sandstone/Shale', 'Shale', 'Marl', 'Dolomite',
       'Limestone', 'Chalk', 'Coal', 'MODEL', 'index', 'WELL', 'DEPTH_MD',
       'GROUPED', 'Anhydrite', 'Halite', 'Basement', 'Tuff'],
      dtype='object')

In [9]:
cols_ordered = ['Sandstone', 'Sandstone/Shale', 'Shale', 'Marl', 'Dolomite',
                'Limestone', 'Chalk', 'Coal', 'Anhydrite', 'Halite', 'Basement', 'Tuff',
                'GROUPED', 'MODEL', 'index', 'WELL', 'DEPTH_MD'
               ]

In [10]:
models_probas = models_probas.loc[:, cols_ordered]

In [11]:
models_probas.fillna(0.0, inplace=True)

In [12]:
models_probas.sample(10)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Coal,Anhydrite,Halite,Basement,Tuff,GROUPED,MODEL,index,WELL,DEPTH_MD
476386,0.033765,0.134936,0.788819,0.011032,0.005391,0.021663,0.004395,0.0,0.0,0.0,0.0,0.0,shetland_gp,3,97823,34/3-3 A,2758.273975
286877,0.019461,0.291354,0.665384,0.005093,0.004419,0.009437,0.0,0.004852,0.0,0.0,0.0,0.0,dunlin_gp,3,68698,29/3-1,4292.394001
133811,0.637828,0.302166,0.029786,0.006814,0.006184,0.008555,0.0,0.008668,0.0,0.0,0.0,0.0,baat_gp,0,88962,34/10-16 R,3210.904008
543222,0.010486,0.011696,0.908032,0.024778,0.004272,0.037206,0.00353,0.0,0.0,0.0,0.0,0.0,shetland_gp,4,126138,35/6-2 S,2722.872467
263771,0.049593,0.392696,0.523095,0.007381,0.006403,0.013799,0.0,0.007032,0.0,0.0,0.0,0.0,dunlin_gp,1,116392,34/6-1 S,3938.2184
647842,0.02253,0.024584,0.942055,0.00199,0.002483,0.003901,0.0,0.0,0.0,0.0,0.0,0.002457,hordaland_gp,3,50676,29/3-1,1539.826001
609046,0.033289,0.050485,0.881468,0.003276,0.004031,0.023709,0.0,0.0,0.0,0.0,0.0,0.00374,hordaland_gp,2,20627,25/10-10,1142.9584
470064,0.004669,0.020035,0.95713,0.007307,0.001893,0.007579,0.001386,0.0,0.0,0.0,0.0,0.0,shetland_gp,3,84503,34/10-16 R,2533.136008
181549,0.785763,0.07118,0.125806,0.005506,0.0,0.006117,0.0,0.005629,0.0,0.0,0.0,0.0,nordland_gp,1,45728,29/3-1,787.730001
53529,0.002677,0.006626,0.976444,0.003447,0.001666,0.007252,0.0,0.000937,0.000951,0.0,0.0,0.0,vtb_gp,4,17756,15/9-14,3180.908001


In [13]:
# The sum of all probabilities for each sample should be 1
models_probas.loc[:, 'Sandstone':'Tuff'].sum(axis=1).describe()

count    6.839300e+05
mean     1.000000e+00
std      3.576565e-08
min      9.999999e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [14]:
models_probas.head()

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Coal,Anhydrite,Halite,Basement,Tuff,GROUPED,MODEL,index,WELL,DEPTH_MD
0,0.05594,0.200757,0.112594,0.402318,0.013897,0.188835,0.012885,0.012775,0.0,0.0,0.0,0.0,cromer_knoll_gp,0,17106,15/9-14,3082.108001
1,0.105826,0.208147,0.207906,0.337497,0.01212,0.106125,0.011238,0.011142,0.0,0.0,0.0,0.0,cromer_knoll_gp,0,17107,15/9-14,3082.260001
2,0.103384,0.202475,0.216792,0.344278,0.011771,0.099564,0.010914,0.010821,0.0,0.0,0.0,0.0,cromer_knoll_gp,0,17108,15/9-14,3082.412001
3,0.106626,0.205785,0.201786,0.347552,0.011964,0.104195,0.011093,0.010998,0.0,0.0,0.0,0.0,cromer_knoll_gp,0,17109,15/9-14,3082.564001
4,0.112357,0.202248,0.187753,0.371444,0.011752,0.092747,0.010896,0.010803,0.0,0.0,0.0,0.0,cromer_knoll_gp,0,17110,15/9-14,3082.716001


## y_true

In [15]:
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

csv_open_test = pd.read_csv(csv_open_test_path, ',')

In [16]:
csv_open_test.head()

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [17]:
y_true = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(KEYS_TO_ORDINAL)

In [18]:
len(y_true)

136786

# Probabilities mean over models

In [19]:
models_probas_cumsums = models_probas.groupby('index').sum()

models_probas_cumsums.head()

Unnamed: 0_level_0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Coal,Anhydrite,Halite,Basement,Tuff,MODEL,DEPTH_MD
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.622447,0.901604,3.322146,0.054099,0.0,0.05062,0.0,0.049083,0.0,0.0,0.0,0.0,10,2403.140004
1,0.289081,1.504807,2.920684,0.100398,0.0,0.093941,0.0,0.091089,0.0,0.0,0.0,0.0,10,2403.900004
2,0.303417,1.572405,2.831906,0.104845,0.0,0.096454,0.0,0.090974,0.0,0.0,0.0,0.0,10,2404.660004
3,0.330296,1.320263,3.06888,0.10006,0.0,0.094218,0.0,0.086283,0.0,0.0,0.0,0.0,10,2405.420004
4,0.231459,0.889359,3.640302,0.087052,0.0,0.077084,0.0,0.074744,0.0,0.0,0.0,0.0,10,2406.180004


In [20]:
models_probas_cumsums = models_probas_cumsums.loc[:, 'Sandstone':'Tuff']

In [21]:
models_probas_cumsums.head()

Unnamed: 0_level_0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Coal,Anhydrite,Halite,Basement,Tuff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.622447,0.901604,3.322146,0.054099,0.0,0.05062,0.0,0.049083,0.0,0.0,0.0,0.0
1,0.289081,1.504807,2.920684,0.100398,0.0,0.093941,0.0,0.091089,0.0,0.0,0.0,0.0
2,0.303417,1.572405,2.831906,0.104845,0.0,0.096454,0.0,0.090974,0.0,0.0,0.0,0.0
3,0.330296,1.320263,3.06888,0.10006,0.0,0.094218,0.0,0.086283,0.0,0.0,0.0,0.0
4,0.231459,0.889359,3.640302,0.087052,0.0,0.077084,0.0,0.074744,0.0,0.0,0.0,0.0


In [22]:
models_probas_cumsums.shape

(136786, 12)

In [23]:
models_len = len(models_probas['MODEL'].unique())

In [24]:
models_probas_mean = models_probas_cumsums / models_len

In [25]:
models_probas_mean.head()

Unnamed: 0_level_0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Coal,Anhydrite,Halite,Basement,Tuff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.124489,0.180321,0.664429,0.01082,0.0,0.010124,0.0,0.009817,0.0,0.0,0.0,0.0
1,0.057816,0.300961,0.584137,0.02008,0.0,0.018788,0.0,0.018218,0.0,0.0,0.0,0.0
2,0.060683,0.314481,0.566381,0.020969,0.0,0.019291,0.0,0.018195,0.0,0.0,0.0,0.0
3,0.066059,0.264053,0.613776,0.020012,0.0,0.018844,0.0,0.017257,0.0,0.0,0.0,0.0
4,0.046292,0.177872,0.72806,0.01741,0.0,0.015417,0.0,0.014949,0.0,0.0,0.0,0.0


# Lithology with highest cumsum probability per sample

In [26]:
# Turn column names from lith to ordinal
models_probas_mean.columns = [LITHOLOGY_TO_ORDINAL[col] for col in models_probas_mean.columns]

In [27]:
y_pred = models_probas_mean.idxmax(axis=1)

# Score

In [28]:
open_test_score = score(y_true, y_pred)

In [29]:
print(f'Olawale modified open test score is: {open_test_score:.4f}')

Olawale modified open test score is: -0.5674
