In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in Colab using notebook 5.0. Unfortunately, neither of the model saving methods (the `.save()` class method and pickle) produced loaded models whose predicted probabilities matched the ones predicted using the in-memory models. The probabilities predicted by the loaded models were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
# Per-model class probabilities that were predicted and saved on Colab.
models_proba_path = ROOT_DIR / 'data/raw' / 'models_proba_with_nans.csv'

# Fail right here with an actionable message. Only printing would let the
# notebook continue and die in the next cell with a NameError on the
# undefined `models_proba`.
if not models_proba_path.is_file():
    raise FileNotFoundError(
        f'{models_proba_path} not found. '
        'Check if the model proba file exists. If not, run notebook 5.0-rp-fit-predict-save-proba-colab on Colab'
    )

models_proba = pd.read_csv(models_proba_path)

In [4]:
# Peek at 10 random rows (no random_state is set, so the rows shown differ between runs).
models_proba.sample(10)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
844371,0.009643,0.007124,0.971233,0.001146,0.001067,0.004761,0.001181,0.000568,0.000566,0.001506,0.000669,0.000535,6,25/10-10,1603.2144
1291513,0.014014,0.214075,0.72321,0.011066,0.001893,0.028613,0.001186,0.001066,0.001063,0.001334,0.001476,0.001004,9,29/3-1,3025.474001
426331,0.024239,0.029278,0.018977,0.161515,0.004599,0.584094,0.157228,0.010289,0.002232,0.002803,0.002637,0.00211,3,15/9-14,2909.892001
544720,0.190291,0.616963,0.124602,0.007013,0.002881,0.049708,0.001402,0.00126,0.001256,0.001577,0.00186,0.001187,3,35/9-8,2817.4856
650741,0.015248,0.100458,0.833627,0.012105,0.00192,0.026358,0.001348,0.001212,0.001208,0.002255,0.003118,0.001142,4,34/3-3 A,3635.921975
1330890,0.009526,0.0338,0.92921,0.008817,0.001287,0.011501,0.000904,0.000812,0.00081,0.001511,0.001057,0.000765,9,34/3-3 A,3061.209975
539015,0.765559,0.103199,0.041184,0.028703,0.003683,0.044541,0.002118,0.001903,0.001898,0.002383,0.003035,0.001794,3,35/6-2 S,3107.736467
153438,0.033998,0.030788,0.019034,0.156582,0.007459,0.553698,0.179438,0.006968,0.003203,0.002991,0.003591,0.002251,1,15/9-14,3013.100001
99465,0.007976,0.036209,0.920961,0.007626,0.00136,0.019785,0.000955,0.000858,0.000856,0.001597,0.001011,0.000809,0,34/3-3 A,3007.857976
1269930,0.005629,0.010888,0.976912,0.000781,0.00075,0.001701,0.000445,0.0004,0.000399,0.001246,0.000471,0.000377,9,25/5-3,1922.79519


In [5]:
# Sanity check: the class probabilities of every row should add up to 1
lithology_probas = models_proba.loc[:, 'Sandstone':'Basement']
row_totals = lithology_probas.sum(axis=1)
row_totals.describe()

count    1.367860e+06
mean     1.000000e+00
std      3.617847e-08
min      9.999999e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [6]:
# Show the first rows of the loaded probabilities, in file order.
models_proba.head()

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
0,0.03854,0.055594,0.884845,0.002524,0.00141,0.007435,0.001522,0.001077,0.000735,0.004425,0.001198,0.000694,0,15/9-14,480.628001
1,0.009298,0.03233,0.948929,0.00114,0.000892,0.002302,0.001192,0.000509,0.000507,0.001726,0.000696,0.000479,0,15/9-14,480.780001
2,0.008961,0.032311,0.949015,0.001131,0.000885,0.002711,0.001151,0.000505,0.000503,0.001712,0.000638,0.000476,0,15/9-14,480.932001
3,0.008565,0.031168,0.950809,0.001047,0.000857,0.002634,0.001133,0.000497,0.000495,0.001685,0.000643,0.000468,0,15/9-14,481.084001
4,0.006072,0.019316,0.967779,0.000757,0.00062,0.001518,0.000841,0.000359,0.000358,0.001218,0.000824,0.000338,0,15/9-14,481.236001


## y_true

In [7]:
# True lithology labels for the open test set.
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

# Pass the separator by keyword: pandas 2.0 made every `read_csv` argument after
# the path keyword-only, so a positional ',' raises a TypeError there (and a
# FutureWarning on late 1.x releases).
csv_open_test = pd.read_csv(csv_open_test_path, sep=',')

In [8]:
# Show the first rows of the true-label table.
csv_open_test.head()

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [9]:
# Translate the raw lithology keys into ordinal class labels
raw_lithology_keys = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY']
y_true = raw_lithology_keys.map(KEYS_TO_ORDINAL)

# Probabilities cumulative sum over models

In [10]:
# Element-wise sum of the class probabilities over all models. Each model's rows
# are re-indexed from 0 so the per-model frames are added by row position.
models_probas_cumsum = 0
for _, single_model_proba in models_proba.groupby('MODEL'):
    single_model_classes = single_model_proba.loc[:, 'Sandstone':'Basement']
    models_probas_cumsum = models_probas_cumsum + single_model_classes.reset_index(drop=True)

In [11]:
# Display the probabilities summed over models (one column per lithology).
models_probas_cumsum

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,0.385404,0.555939,8.848453,0.025241,0.014100,0.074350,0.015223,0.010771,0.007349,0.044246,0.011981,0.006945
1,0.092983,0.323299,9.489286,0.011399,0.008925,0.023019,0.011917,0.005087,0.005073,0.017258,0.006961,0.004794
2,0.089614,0.323113,9.490154,0.011309,0.008854,0.027107,0.011511,0.005047,0.005033,0.017122,0.006379,0.004756
3,0.085654,0.311676,9.508088,0.010472,0.008572,0.026342,0.011325,0.004966,0.004951,0.016845,0.006430,0.004679
4,0.060723,0.193160,9.677793,0.007570,0.006196,0.015178,0.008414,0.003590,0.003579,0.012178,0.008236,0.003383
...,...,...,...,...,...,...,...,...,...,...,...,...
136781,7.139156,2.277342,0.305344,0.020101,0.010308,0.179397,0.007646,0.008041,0.006851,0.008603,0.030736,0.006475
136782,7.238378,2.231765,0.264967,0.020237,0.010378,0.165463,0.007697,0.008095,0.006897,0.008661,0.030944,0.006518
136783,7.039409,2.332573,0.337855,0.023787,0.010806,0.183916,0.008015,0.008430,0.007182,0.009018,0.032221,0.006787
136784,6.459459,2.977263,0.358475,0.019727,0.010116,0.108317,0.007503,0.007891,0.006723,0.008443,0.029728,0.006354


# Lithology with highest cumsum probability per sample

In [12]:
# Turn column names from lith to ordinal.
# `rename` returns a new frame and leaves labels that are missing from the
# mapping untouched, so re-running this cell is a no-op. The previous in-place
# `.columns = [...]` assignment raised a KeyError on a second run because the
# columns had already been converted to ordinals.
models_probas_cumsum = models_probas_cumsum.rename(columns=LITHOLOGY_TO_ORDINAL)

In [13]:
# For every row, take the column label holding the largest summed probability
y_pred = models_probas_cumsum.idxmax(axis='columns')

# Score

In [14]:
# Evaluate the combined prediction with the project's `score` helper.
# NOTE(review): this presumably relies on y_true and y_pred being aligned row by row — confirm.
open_test_score = score(y_true, y_pred)

In [15]:
# Report the score rounded to four decimal places.
print(f'Olawale modified open test score is: {open_test_score:.4f}')

Olawale modified open test score is: -0.5442
