In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in Colab using notebook 5.0. Unfortunately, the model-saving approaches (the `.save()` method and pickle) didn't work: the loaded models did not produce the same predicted probabilities as the in-memory models. The probabilities predicted by the loaded models were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
# CSV with the class probabilities predicted on Colab by every model.
models_proba_path = ROOT_DIR / 'data/raw' / 'models_proba_with_nans.csv'

# Fail fast: if the file is missing, `models_proba` would never be defined and
# every later cell would die with an unrelated NameError.
if not models_proba_path.is_file():
    raise FileNotFoundError(
        f'{models_proba_path} not found. '
        'Check if the model proba file exists. If not, run notebook '
        '5.0-rp-fit-predict-save-proba-colab on Colab'
    )

models_proba = pd.read_csv(models_proba_path)

In [4]:
# Peek at ten randomly drawn rows of the stacked predictions
models_proba.sample(n=10)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
271289,0.685231,0.26029,0.042673,0.002013,0.000965,0.00466,0.000716,0.000643,0.000641,0.000805,0.000757,0.000606,1,35/9-8,2838.9176
178905,0.826419,0.126499,0.017518,0.00848,0.004379,0.010648,0.000773,0.000695,0.000693,0.00087,0.00237,0.000655,1,25/5-3,2418.77119
1004504,0.981529,0.006618,0.007162,0.000466,0.000513,0.001035,0.000726,0.000337,0.000252,0.000825,0.000298,0.000238,7,29/3-1,981.378001
900457,0.003896,0.053551,0.930868,0.000503,0.000447,0.001777,0.000296,0.000263,0.000262,0.007465,0.000425,0.000247,6,34/10-16 R,1809.312008
353565,0.003884,0.077053,0.90054,0.000593,0.000589,0.002366,0.000355,0.00031,0.000309,0.013196,0.000512,0.000292,2,34/10-16 R,1847.616008
305312,0.002871,0.016616,0.969912,0.002121,0.000771,0.003866,0.000719,0.00041,0.000409,0.001389,0.000528,0.000387,2,25/11-24,1782.1472
676623,0.730162,0.081062,0.043414,0.026646,0.009053,0.095357,0.002066,0.002173,0.002447,0.002325,0.003546,0.00175,4,35/6-2 S,3234.656467
693063,0.00191,0.005137,0.986831,0.000405,0.000344,0.001544,0.000557,0.000199,0.000199,0.002436,0.000249,0.000188,5,15/9-14,1870.212001
989311,0.277623,0.044402,0.630735,0.008663,0.002973,0.011566,0.003385,0.001582,0.001578,0.014028,0.001972,0.001491,7,25/11-24,1792.6352
384707,0.010411,0.044245,0.923805,0.001945,0.001071,0.010486,0.000759,0.000682,0.00068,0.001269,0.004004,0.000643,2,34/3-3 A,4782.153976


In [5]:
# Sanity check: the class probabilities of every row should add up to 1
class_probas = models_proba.loc[:, 'Sandstone':'Basement']
class_probas.sum(axis='columns').describe()

count    1.367860e+06
mean     1.000000e+00
std      3.617847e-08
min      9.999999e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [6]:
# First rows of the stacked per-model predictions
models_proba.head(n=5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
0,0.03854,0.055594,0.884845,0.002524,0.00141,0.007435,0.001522,0.001077,0.000735,0.004425,0.001198,0.000694,0,15/9-14,480.628001
1,0.009298,0.03233,0.948929,0.00114,0.000892,0.002302,0.001192,0.000509,0.000507,0.001726,0.000696,0.000479,0,15/9-14,480.780001
2,0.008961,0.032311,0.949015,0.001131,0.000885,0.002711,0.001151,0.000505,0.000503,0.001712,0.000638,0.000476,0,15/9-14,480.932001
3,0.008565,0.031168,0.950809,0.001047,0.000857,0.002634,0.001133,0.000497,0.000495,0.001685,0.000643,0.000468,0,15/9-14,481.084001
4,0.006072,0.019316,0.967779,0.000757,0.00062,0.001518,0.000841,0.000359,0.000358,0.001218,0.000824,0.000338,0,15/9-14,481.236001


## y_true

In [7]:
# CSV holding the true lithology labels of the open test set.
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

# `sep` is passed by keyword: pandas deprecated positional arguments after the
# path, and pandas >= 2.0 rejects them with a TypeError.
csv_open_test = pd.read_csv(csv_open_test_path, sep=',')

In [8]:
# First rows of the open test ground truth
csv_open_test.head(n=5)

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [9]:
# Translate the raw lithology keys (e.g. 65000) through KEYS_TO_ORDINAL
lithology_keys = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY']
y_true = lithology_keys.map(KEYS_TO_ORDINAL)

# Mean probabilities over models

In [10]:
# Element-wise sum of the class probabilities over all models.
# Each model's rows are re-indexed from 0 so the frames line up by position.
per_model_probas = [
    group.loc[:, 'Sandstone':'Basement'].reset_index(drop=True)
    for _, group in models_proba.groupby('MODEL')
]
models_probas_cumsum = sum(per_model_probas)

In [11]:
# First rows of the summed probabilities
models_probas_cumsum.head(n=5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,0.385404,0.555939,8.848453,0.025241,0.0141,0.07435,0.015223,0.010771,0.007349,0.044246,0.011981,0.006945
1,0.092983,0.323299,9.489286,0.011399,0.008925,0.023019,0.011917,0.005087,0.005073,0.017258,0.006961,0.004794
2,0.089614,0.323113,9.490154,0.011309,0.008854,0.027107,0.011511,0.005047,0.005033,0.017122,0.006379,0.004756
3,0.085654,0.311676,9.508088,0.010472,0.008572,0.026342,0.011325,0.004966,0.004951,0.016845,0.00643,0.004679
4,0.060723,0.19316,9.677793,0.00757,0.006196,0.015178,0.008414,0.00359,0.003579,0.012178,0.008236,0.003383


In [12]:
# Number of distinct values in the MODEL column (NaN, if any, counted once)
models_len = models_proba['MODEL'].nunique(dropna=False)

In [13]:
# Average the summed probabilities over the number of models
models_probas_mean = models_probas_cumsum.div(models_len)

In [14]:
# First rows of the model-averaged probabilities
models_probas_mean.head(n=5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,0.03854,0.055594,0.884845,0.002524,0.00141,0.007435,0.001522,0.001077,0.000735,0.004425,0.001198,0.000694
1,0.009298,0.03233,0.948929,0.00114,0.000892,0.002302,0.001192,0.000509,0.000507,0.001726,0.000696,0.000479
2,0.008961,0.032311,0.949015,0.001131,0.000885,0.002711,0.001151,0.000505,0.000503,0.001712,0.000638,0.000476
3,0.008565,0.031168,0.950809,0.001047,0.000857,0.002634,0.001133,0.000497,0.000495,0.001685,0.000643,0.000468
4,0.006072,0.019316,0.967779,0.000757,0.00062,0.001518,0.000841,0.000359,0.000358,0.001218,0.000824,0.000338


# Lithology with highest mean probability per sample

In [15]:
# Turn column names from lithology names into their ordinal labels.
# `rename` leaves labels that are not keys of the mapping untouched, so the cell
# can be re-run safely; the previous list-comprehension lookup raised KeyError
# on a second run because the columns were already ordinals.
# NOTE(review): assumes no ordinal value is itself a key of LITHOLOGY_TO_ORDINAL.
models_probas_mean = models_probas_mean.rename(columns=LITHOLOGY_TO_ORDINAL)

In [16]:
# For every sample, pick the column label holding the largest mean probability
y_pred = models_probas_mean.idxmax(axis='columns')

# Score

In [17]:
# Evaluate the predictions with the project's `score` helper
# (imported from src.model.train_model).
# NOTE(review): y_true and y_pred are built independently and nothing here checks
# that their rows correspond — presumably both follow the open-test row order;
# confirm.
open_test_score = score(y_true, y_pred)

In [18]:
# Report the score rounded to four decimals
score_message = f'Olawale modified open test score is: {open_test_score:.4f}'
print(score_message)

Olawale modified open test score is: -0.5442
