In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.data.download_data import download_from_google_drive
from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in Colab using notebook 5.0. Unfortunately, the model saving methods (the `.save()` class method and pickle) didn't produce the same predicted probabilities from the loaded models as the ones predicted by the in-memory models. The probabilities predicted by the loaded models were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
# Location of the per-model predicted probabilities exported from Colab
models_proba_path = ROOT_DIR.joinpath('data/raw', 'models_proba_without_nans.csv')

In [4]:
# Load the saved per-model probabilities, downloading the CSV first if it is
# not available locally.
if models_proba_path.is_file():
    models_proba = pd.read_csv(models_proba_path)
else:
    # Try downloading it from Google drive.
    # The helper is given a {file name: Google Drive file id} mapping and the
    # directory to write into (presumably it saves the file under that name --
    # verify against src.data.download_data).
    output_root = models_proba_path.parent
    file_id = {models_proba_path.name: '1sB1yZObbR5JvRzf-yLoFjETKLRDfRAvA'}

    try:
        download_from_google_drive(file_id, output_root=output_root)
        models_proba = pd.read_csv(models_proba_path)
    except Exception:
        # Show the troubleshooting hints, then re-raise: without the file
        # `models_proba` would be undefined (or stale from a previous run) and
        # every later cell would fail in a far less obvious way.
        # `Exception` (not a bare `except`) keeps KeyboardInterrupt working.
        print('Check if the model proba file exists. If not, run notebook 5.0-rp-fit-predict-save-proba-colab on Colab')
        print()
        print('Also, confirm the file id is the same here as shown in Google drive')
        raise

In [5]:
# Peek at 10 random rows; the fixed seed keeps the displayed sample identical
# across "Restart & Run All" executions.
models_proba.sample(10, random_state=42)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
1175084,0.005325,0.054878,0.920754,0.002236,0.001133,0.006011,0.000544,0.00048,0.000485,0.007099,0.000599,0.000455,8,34/10-16 R,1969.672008
1079335,0.025004,0.031828,0.855268,0.044863,0.002783,0.02944,0.00185,0.001633,0.001649,0.002128,0.002008,0.001548,7,35/6-2 S,2068.512467
798864,0.007651,0.158132,0.822146,0.00216,0.001061,0.004015,0.00071,0.000627,0.000633,0.000868,0.001403,0.000594,5,34/6-1 S,3708.3944
1273770,0.026714,0.09542,0.859482,0.004839,0.001331,0.005682,0.000909,0.000802,0.00081,0.001045,0.002208,0.00076,9,25/5-3,2506.47519
1022301,0.739228,0.168851,0.035751,0.00644,0.007343,0.02287,0.001079,0.000952,0.000961,0.001241,0.014382,0.000902,7,29/3-1,3698.986001
1231268,0.286815,0.197553,0.442324,0.003764,0.003295,0.007985,0.005684,0.001716,0.00179,0.005935,0.041514,0.001626,9,15/9-14,510.116001
1185461,0.002924,0.048734,0.942064,0.001083,0.000616,0.001668,0.000417,0.000368,0.000372,0.00048,0.000925,0.000349,8,34/10-16 R,3546.976008
532365,0.020402,0.030847,0.889006,0.02173,0.002453,0.026032,0.00163,0.001439,0.001453,0.001875,0.001769,0.001364,3,35/6-2 S,2094.960467
919333,0.018004,0.118051,0.823825,0.01596,0.002531,0.011671,0.001683,0.001485,0.0015,0.002057,0.001826,0.001407,6,34/3-3 A,2878.961975
52073,0.1739,0.021504,0.051228,0.00194,0.001141,0.021348,0.001262,0.000577,0.000606,0.725014,0.000935,0.000547,0,29/3-1,1753.842001


In [6]:
# Sanity check: the lithology probabilities of every row should add up to 1
proba_row_totals = models_proba.loc[:, 'Sandstone':'Basement'].sum(axis='columns')
proba_row_totals.describe()

count    1.367860e+06
mean     1.000000e+00
std      3.611896e-08
min      9.999999e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [7]:
# First five rows of the per-model probabilities
models_proba.head(n=5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
0,0.135053,0.1451,0.609656,0.005653,0.009861,0.033358,0.029547,0.00822,0.005458,0.010135,0.005396,0.002562,0,15/9-14,480.628001
1,0.008331,0.025157,0.958146,0.000981,0.000782,0.002177,0.001262,0.000381,0.000397,0.001399,0.000625,0.000361,0,15/9-14,480.780001
2,0.009238,0.027432,0.95416,0.001013,0.000801,0.002824,0.001293,0.00039,0.000407,0.001433,0.00064,0.00037,0,15/9-14,480.932001
3,0.010348,0.027173,0.953974,0.000925,0.00072,0.002627,0.001207,0.000364,0.00038,0.001338,0.000598,0.000345,0,15/9-14,481.084001
4,0.00505,0.018408,0.970629,0.000564,0.000509,0.001523,0.000885,0.000258,0.000269,0.000946,0.000716,0.000244,0,15/9-14,481.236001


## y_true

In [8]:
# True lithology labels of the open test set
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

# The delimiter is passed by keyword: supplying it positionally is deprecated
# and rejected by newer pandas releases, where only the path may be positional.
csv_open_test = pd.read_csv(csv_open_test_path, sep=',')

In [9]:
# First five rows of the open test labels
csv_open_test.head(n=5)

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [10]:
# Translate the raw lithology keys of the open test set through KEYS_TO_ORDINAL
lithology_keys = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY']
y_true = lithology_keys.map(KEYS_TO_ORDINAL)

# Probabilities cumulative sum over models

In [11]:
# Add up the lithology probabilities of every model, row by row.
# Each model's rows are re-indexed from 0 so the frames line up positionally
# (assumes every model holds the same samples in the same order -- TODO confirm).
models_probas_cumsum = 0
for _, single_model_proba in models_proba.groupby('MODEL'):
    lithology_proba = single_model_proba.loc[:, 'Sandstone':'Basement']
    models_probas_cumsum = models_probas_cumsum + lithology_proba.reset_index(drop=True)

In [12]:
# Show the per-sample probabilities summed over all models (one column per lithology)
models_probas_cumsum

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,1.350526,1.450997,6.096564,0.056532,0.098615,0.333576,0.295469,0.082205,0.054583,0.101351,0.053963,0.025621
1,0.083312,0.251573,9.581460,0.009809,0.007819,0.021772,0.012621,0.003809,0.003974,0.013988,0.006253,0.003610
2,0.092380,0.274323,9.541596,0.010126,0.008007,0.028243,0.012926,0.003901,0.004070,0.014326,0.006404,0.003697
3,0.103483,0.271732,9.539744,0.009249,0.007201,0.026265,0.012071,0.003643,0.003801,0.013379,0.005980,0.003453
4,0.050498,0.184082,9.706286,0.005636,0.005092,0.015228,0.008853,0.002576,0.002688,0.009461,0.007158,0.002442
...,...,...,...,...,...,...,...,...,...,...,...,...
136781,7.531906,1.692742,0.307381,0.019543,0.011075,0.373994,0.007523,0.006640,0.006955,0.008653,0.027295,0.006293
136782,7.546520,1.691414,0.293449,0.019649,0.011412,0.373854,0.007564,0.006676,0.006992,0.008699,0.027442,0.006327
136783,7.646067,1.524294,0.342105,0.020803,0.011428,0.387859,0.008008,0.007069,0.007403,0.009211,0.029055,0.006699
136784,7.435501,1.923569,0.331606,0.020092,0.011037,0.213313,0.007735,0.006827,0.006893,0.008896,0.028062,0.006470


# Lithology with highest cumsum probability per sample

In [13]:
# Turn column names from lithology names to their ordinal codes.
# `rename` leaves labels that are not keys of the mapping untouched, so
# re-running this cell after the columns are already ordinals is a no-op
# instead of raising a KeyError.
models_probas_cumsum = models_probas_cumsum.rename(columns=LITHOLOGY_TO_ORDINAL)

In [14]:
# For every sample, pick the column label holding the largest summed probability
y_pred = models_probas_cumsum.idxmax(axis='columns')

# Score

In [15]:
# Evaluate the ensemble predictions against the open test labels with the
# project's `score` helper (imported from src.model.train_model).
open_test_score = score(y_true, y_pred)

In [16]:
# Report the open test score rounded to four decimals
score_message = f'Olawale modified open test score is: {open_test_score:.4f}'
print(score_message)

Olawale modified open test score is: -0.5460


Replacing missing values with -999 didn't improve the score.