In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.data.download_data import download_from_google_drive
from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (such as the project's `src` package) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in Colab using notebook 5.0. Unfortunately, neither model saving method (the `.save()` class method or pickle) produced loaded models whose predicted probabilities matched those predicted by the in-memory models. The probabilities predicted by the loaded models were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
# Local path of the CSV holding the predicted probabilities saved from Colab
models_proba_path = ROOT_DIR / 'data/raw' / 'models_proba_without_nans.csv'

In [4]:
# Load the saved model probabilities, fetching the file from Google Drive
# first when there is no local copy.
if models_proba_path.is_file():
    models_proba = pd.read_csv(models_proba_path)
else:
    # Try downloading it from Google drive
    output_root = models_proba_path.parent
    file_id = {models_proba_path.name: '1sB1yZObbR5JvRzf-yLoFjETKLRDfRAvA'}

    try:
        download_from_google_drive(file_id, output_root=output_root)
        models_proba = pd.read_csv(models_proba_path)
    except Exception:
        # `Exception` (not a bare `except`) so Ctrl-C / kernel interrupts still propagate.
        print('Check if the model proba file exists. If not, run notebook 6.0-rp-fit-predict-save-proba-fillnan-colab on Colab')
        print()
        print('Also, confirm the file id is the same here as shown in Google drive')
        # Re-raise so the notebook stops here with the real cause, rather than
        # failing in a later cell with a NameError on `models_proba`.
        raise

In [5]:
# Preview 10 random rows; the fixed seed keeps the preview identical across re-runs
models_proba.sample(10, random_state=0)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
290350,0.029527,0.019906,0.022319,0.156235,0.009184,0.607285,0.140187,0.003666,0.002903,0.00298,0.00364,0.002167,2,15/9-14,3032.252001
365491,0.004161,0.048976,0.938729,0.001423,0.000792,0.002323,0.000536,0.000473,0.000478,0.000617,0.001043,0.000449,2,34/10-16 R,3660.368008
682369,0.003876,0.895281,0.090603,0.002418,0.000827,0.00352,0.00058,0.000512,0.000517,0.000667,0.000714,0.000485,4,35/9-8,2948.6616
1178734,0.005458,0.017565,0.966201,0.003301,0.000772,0.003958,0.000468,0.000413,0.000417,0.000539,0.000516,0.000392,8,34/10-16 R,2524.472008
210647,0.498415,0.394759,0.080346,0.002936,0.002906,0.006877,0.002488,0.001568,0.001647,0.003597,0.002974,0.001486,1,34/10-16 R,915.552008
252083,0.015026,0.375294,0.589837,0.003673,0.001805,0.006144,0.001208,0.001066,0.001076,0.001476,0.002386,0.00101,1,34/6-1 S,3763.5704
1084411,0.049757,0.078211,0.30278,0.125912,0.005054,0.418424,0.003524,0.002965,0.003054,0.003864,0.003645,0.00281,7,35/6-2 S,2840.064467
373524,0.011342,0.057456,0.8842,0.01129,0.001833,0.026638,0.001218,0.001075,0.001086,0.001489,0.001354,0.001019,2,34/3-3 A,3081.881975
1299287,0.070626,0.653114,0.240842,0.005488,0.003188,0.01264,0.001919,0.001694,0.002437,0.002207,0.00424,0.001605,9,29/3-1,4218.674001
1319135,0.086141,0.0818,0.740582,0.014027,0.003039,0.014443,0.001653,0.001459,0.001473,0.001901,0.052099,0.001383,9,34/10-16 R,3073.952008


In [6]:
# Sanity check: the class probabilities of every row should add up to 1
proba_row_totals = models_proba.loc[:, 'Sandstone':'Basement'].sum(axis='columns')
proba_row_totals.describe()

count    1.367860e+06
mean     1.000000e+00
std      3.611896e-08
min      9.999999e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [7]:
# First rows of the probabilities table, in file order
models_proba.head()

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
0,0.135053,0.1451,0.609656,0.005653,0.009861,0.033358,0.029547,0.00822,0.005458,0.010135,0.005396,0.002562,0,15/9-14,480.628001
1,0.008331,0.025157,0.958146,0.000981,0.000782,0.002177,0.001262,0.000381,0.000397,0.001399,0.000625,0.000361,0,15/9-14,480.780001
2,0.009238,0.027432,0.95416,0.001013,0.000801,0.002824,0.001293,0.00039,0.000407,0.001433,0.00064,0.00037,0,15/9-14,480.932001
3,0.010348,0.027173,0.953974,0.000925,0.00072,0.002627,0.001207,0.000364,0.00038,0.001338,0.000598,0.000345,0,15/9-14,481.084001
4,0.00505,0.018408,0.970629,0.000564,0.000509,0.001523,0.000885,0.000258,0.000269,0.000946,0.000716,0.000244,0,15/9-14,481.236001


## y_true

In [8]:
# CSV with the true lithology labels of the open test set
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

# Pass `sep` by keyword: pandas >= 2.0 only accepts the path positionally,
# so `pd.read_csv(path, ',')` raises a TypeError there.
csv_open_test = pd.read_csv(csv_open_test_path, sep=',')

In [9]:
# First rows of the true-label table (well, depth and lithology key)
csv_open_test.head()

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [10]:
# Map the raw lithology keys through KEYS_TO_ORDINAL to get the true labels
lithology_keys = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY']
y_true = lithology_keys.map(KEYS_TO_ORDINAL)

# Probabilities mean over models

In [11]:
# Add up the class probabilities of all models. Each model's rows are
# re-indexed from 0, so the frames are added position by position.
# NOTE(review): assumes every model holds the same samples in the same order - confirm.
per_model_probas = [
    group.loc[:, 'Sandstone':'Basement'].reset_index(drop=True)
    for _, group in models_proba.groupby('MODEL')
]
models_probas_cumsum = sum(per_model_probas)

In [12]:
# Display the summed probabilities (one row per sample, one column per lithology)
models_probas_cumsum

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,1.350526,1.450997,6.096564,0.056532,0.098615,0.333576,0.295469,0.082205,0.054583,0.101351,0.053963,0.025621
1,0.083312,0.251573,9.581460,0.009809,0.007819,0.021772,0.012621,0.003809,0.003974,0.013988,0.006253,0.003610
2,0.092380,0.274323,9.541596,0.010126,0.008007,0.028243,0.012926,0.003901,0.004070,0.014326,0.006404,0.003697
3,0.103483,0.271732,9.539744,0.009249,0.007201,0.026265,0.012071,0.003643,0.003801,0.013379,0.005980,0.003453
4,0.050498,0.184082,9.706286,0.005636,0.005092,0.015228,0.008853,0.002576,0.002688,0.009461,0.007158,0.002442
...,...,...,...,...,...,...,...,...,...,...,...,...
136781,7.531906,1.692742,0.307381,0.019543,0.011075,0.373994,0.007523,0.006640,0.006955,0.008653,0.027295,0.006293
136782,7.546520,1.691414,0.293449,0.019649,0.011412,0.373854,0.007564,0.006676,0.006992,0.008699,0.027442,0.006327
136783,7.646067,1.524294,0.342105,0.020803,0.011428,0.387859,0.008008,0.007069,0.007403,0.009211,0.029055,0.006699
136784,7.435501,1.923569,0.331606,0.020092,0.011037,0.213313,0.007735,0.006827,0.006893,0.008896,0.028062,0.006470


In [13]:
# Number of distinct models. `nunique()` ignores missing values, matching
# `groupby('MODEL')`, which also drops NaN keys, so this divisor always
# equals the number of per-model frames that were summed.
models_len = models_proba['MODEL'].nunique()

In [14]:
# Mean probability per sample and lithology: summed probabilities over the model count
models_probas_mean = models_probas_cumsum.div(models_len)

In [15]:
# First rows of the model-averaged probabilities
models_probas_mean.head()

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,0.135053,0.1451,0.609656,0.005653,0.009861,0.033358,0.029547,0.00822,0.005458,0.010135,0.005396,0.002562
1,0.008331,0.025157,0.958146,0.000981,0.000782,0.002177,0.001262,0.000381,0.000397,0.001399,0.000625,0.000361
2,0.009238,0.027432,0.95416,0.001013,0.000801,0.002824,0.001293,0.00039,0.000407,0.001433,0.00064,0.00037
3,0.010348,0.027173,0.953974,0.000925,0.00072,0.002627,0.001207,0.000364,0.00038,0.001338,0.000598,0.000345
4,0.00505,0.018408,0.970629,0.000564,0.000509,0.001523,0.000885,0.000258,0.000269,0.000946,0.000716,0.000244


# Lithology with highest mean probability per sample

In [16]:
# Turn column names from lithology names to their ordinal codes.
# `rename` leaves labels that are not keys of the mapping untouched, so this
# cell can be re-run safely; looking every column up in the mapping would
# raise a KeyError on the second run, when the columns are already ordinals.
models_probas_mean = models_probas_mean.rename(columns=LITHOLOGY_TO_ORDINAL)

In [17]:
# Predicted label per sample: the column label holding the largest mean probability
y_pred = models_probas_mean.idxmax(axis='columns')

# Score

In [18]:
# Score the averaged-model predictions against the true labels using the
# project's `score` helper (imported from src.model.train_model).
open_test_score = score(y_true, y_pred)

In [19]:
# Report the open test score rounded to four decimal places
print(f'Olawale modified open test score is: {open_test_score:.4f}')

Olawale modified open test score is: -0.5460


Replacing missing values with -999 didn't improve the score.