In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.data.download_data import download_from_google_drive
from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in Colab using notebook 5.0. Unfortunately, neither of the model-saving approaches (the `.save()` class method and pickle) produced loaded models whose predicted probabilities matched the ones predicted by the in-memory models. The probabilities predicted by the loaded models were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
# Location of the csv holding the predicted probabilities saved from Colab
raw_data_dir = ROOT_DIR / 'data/raw'
models_proba_path = raw_data_dir / 'models_proba_most_coulmns_with_nans.csv'

In [4]:
# Load the predicted probabilities, downloading the csv first when there is
# no local copy yet.
if models_proba_path.is_file():
    models_proba = pd.read_csv(models_proba_path)
else:
    # Try downloading it from Google drive into the folder where it is expected
    output_root = models_proba_path.parent
    file_id = {models_proba_path.name: '1JKOcJNdwsycBBMbO_wyn-vtrY9cYwheO'}

    try:
        download_from_google_drive(file_id, output_root=output_root)
        models_proba = pd.read_csv(models_proba_path)
    except Exception:
        # `Exception` (not a bare `except`) so that interrupting the download
        # is not reported as a missing file.
        print('Check if the model proba file exists. If not, run notebook 6.0-rp-fit-predict-save-proba-fillnan-colab on Colab')
        print()
        print('Also, confirm the file id is the same here as shown in Google drive')
        # Re-raise: every later cell needs `models_proba`, so stopping here with
        # the real error is clearer than a NameError further down.
        raise

Downloading...
From: https://drive.google.com/uc?id=1JKOcJNdwsycBBMbO_wyn-vtrY9cYwheO
To: /media/hdd/projects/springboard/force_2020_lith/data/raw/models_proba_most_coulmns_with_nans.csv
238MB [00:04, 56.7MB/s] 


In [5]:
# Peek at a few random rows; the fixed seed keeps the displayed sample
# reproducible across Restart & Run All.
models_proba.sample(n=10, random_state=0)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
350851,0.022346,0.100374,0.842469,0.00126,0.004166,0.023558,0.000765,0.000673,0.000673,0.001967,0.001122,0.000629,2,34/10-16 R,1435.088008
1218975,0.040119,0.063908,0.722197,0.027325,0.004285,0.126251,0.002706,0.00245,0.002449,0.003066,0.002955,0.002289,8,35/6-2 S,2502.320467
287762,0.806423,0.048962,0.046356,0.046366,0.003402,0.025731,0.009254,0.001646,0.001646,0.006747,0.00193,0.001538,2,15/9-14,2638.876001
1163029,0.065585,0.657953,0.249235,0.004923,0.002589,0.007717,0.001687,0.001527,0.001694,0.001912,0.003752,0.001427,8,29/3-1,4298.930001
834522,0.042115,0.05181,0.830464,0.019305,0.010368,0.034201,0.002632,0.000944,0.000944,0.00442,0.001916,0.000882,6,15/9-14,2580.508001
622695,0.55247,0.354552,0.059214,0.003492,0.004265,0.009358,0.003392,0.001822,0.001821,0.005731,0.002181,0.001702,4,34/10-16 R,1172.432008
1266255,0.043463,0.042801,0.901011,0.00142,0.001175,0.004731,0.000809,0.000732,0.000732,0.001245,0.001197,0.000684,9,25/5-3,1364.19519
436112,0.189889,0.033363,0.736591,0.012127,0.001584,0.016437,0.001414,0.001008,0.001008,0.004453,0.001182,0.000942,3,25/10-10,1922.2624
1309309,0.00446,0.067664,0.919115,0.000924,0.000912,0.002485,0.000545,0.000494,0.000494,0.001443,0.001002,0.000461,9,34/10-16 R,1580.400008
245878,0.010419,0.097777,0.854504,0.007708,0.001624,0.019761,0.001046,0.000947,0.000947,0.001755,0.002627,0.000885,1,34/3-3 A,4471.617976


In [6]:
# Sanity check: the class probabilities of every row should add up to 1
class_probas = models_proba.loc[:, 'Sandstone':'Basement']
row_totals = class_probas.sum(axis=1)
row_totals.describe()

count    1.367860e+06
mean     1.000000e+00
std      3.619639e-08
min      9.999999e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [7]:
# First rows of the loaded probabilities, in file order
models_proba.head(n=5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,MODEL,WELL,DEPTH_MD
0,0.041309,0.090986,0.840287,0.00305,0.001865,0.009935,0.003236,0.001112,0.000932,0.005021,0.001395,0.000871,0,15/9-14,480.628001
1,0.007278,0.023626,0.960121,0.001064,0.00097,0.002215,0.001241,0.000444,0.000444,0.00146,0.000723,0.000415,0,15/9-14,480.780001
2,0.006796,0.024181,0.958495,0.002041,0.000917,0.003088,0.001171,0.000429,0.000428,0.00141,0.000642,0.0004,0,15/9-14,480.932001
3,0.007087,0.023949,0.958472,0.001848,0.000952,0.002963,0.001215,0.000445,0.000445,0.001463,0.000745,0.000415,0,15/9-14,481.084001
4,0.00607,0.018395,0.964851,0.003354,0.000843,0.001824,0.001077,0.000394,0.000394,0.001296,0.001135,0.000368,0,15/9-14,481.236001


## y_true

In [8]:
# csv with the true lithology labels of the open test set
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

# `sep` is passed by keyword: recent pandas versions only accept the file
# path as a positional argument of `read_csv`.
csv_open_test = pd.read_csv(csv_open_test_path, sep=',')

In [9]:
# First rows of the open test labels
csv_open_test.head(n=5)

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [10]:
# Translate the raw lithology keys into ordinal labels via KEYS_TO_ORDINAL
lithology_keys = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY']
y_true = lithology_keys.map(KEYS_TO_ORDINAL)

# Probabilities mean over models

In [11]:
# Add up the class probabilities of all models, row by row.
# Each model's rows are re-indexed from 0 so the frames are summed by position.
# NOTE(review): assumes every model has the same rows in the same order - confirm.
class_proba_columns = slice('Sandstone', 'Basement')
per_model_probas = (
    single_model_proba.loc[:, class_proba_columns].reset_index(drop=True)
    for _model_id, single_model_proba in models_proba.groupby('MODEL')
)
models_probas_cumsum = sum(per_model_probas)

In [12]:
# Display the class probabilities summed across models
models_probas_cumsum

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,0.413092,0.909859,8.402868,0.030502,0.018651,0.099355,0.032363,0.011122,0.009320,0.050207,0.013952,0.008709
1,0.072781,0.236261,9.601210,0.010636,0.009695,0.022153,0.012408,0.004437,0.004437,0.014604,0.007231,0.004145
2,0.067963,0.241808,9.584953,0.020413,0.009174,0.030883,0.011712,0.004285,0.004285,0.014104,0.006417,0.004003
3,0.070866,0.239486,9.584724,0.018485,0.009518,0.029635,0.012155,0.004446,0.004446,0.014634,0.007452,0.004154
4,0.060699,0.183951,9.648508,0.033536,0.008431,0.018244,0.010768,0.003938,0.003938,0.012962,0.011347,0.003679
...,...,...,...,...,...,...,...,...,...,...,...,...
136781,7.488011,1.534000,0.453035,0.031502,0.017532,0.359423,0.012811,0.013902,0.011810,0.014517,0.052622,0.010836
136782,7.539843,1.565283,0.393608,0.031647,0.017614,0.334969,0.012871,0.013966,0.011864,0.014584,0.052865,0.010886
136783,7.447477,1.552961,0.481682,0.038332,0.017569,0.344552,0.012838,0.013931,0.012520,0.014547,0.052732,0.010858
136784,7.309926,1.840830,0.482482,0.029823,0.016768,0.211264,0.012253,0.013296,0.011091,0.013884,0.048020,0.010363


In [13]:
# Number of distinct models present in the probabilities table
distinct_models = models_proba['MODEL'].unique()
models_len = len(distinct_models)

In [14]:
# Turn the summed probabilities into a mean by dividing by the number of models
models_probas_mean = models_probas_cumsum / models_len

In [15]:
# First rows of the probabilities averaged over models
models_probas_mean.head(n=5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement
0,0.041309,0.090986,0.840287,0.00305,0.001865,0.009935,0.003236,0.001112,0.000932,0.005021,0.001395,0.000871
1,0.007278,0.023626,0.960121,0.001064,0.00097,0.002215,0.001241,0.000444,0.000444,0.00146,0.000723,0.000415
2,0.006796,0.024181,0.958495,0.002041,0.000917,0.003088,0.001171,0.000429,0.000428,0.00141,0.000642,0.0004
3,0.007087,0.023949,0.958472,0.001848,0.000952,0.002963,0.001215,0.000445,0.000445,0.001463,0.000745,0.000415
4,0.00607,0.018395,0.964851,0.003354,0.000843,0.001824,0.001077,0.000394,0.000394,0.001296,0.001135,0.000368


# Lithology with highest mean probability per sample

In [16]:
# Turn column names from lith to ordinal.
# `rename` returns a new frame instead of mutating the existing one, and leaves
# labels that are not in the mapping untouched, so re-running this cell is
# harmless (assigning `.columns` raised a KeyError on the second run).
models_probas_mean = models_probas_mean.rename(columns=LITHOLOGY_TO_ORDINAL)

In [17]:
# For each row, pick the column label holding the largest mean probability
y_pred = models_probas_mean.idxmax(axis='columns')

# Score

In [18]:
# Score the predictions against the open test labels with the project's
# `score` function (imported from src.model.train_model).
open_test_score = score(y_true, y_pred)

In [19]:
# Report the score rounded to four decimals
score_message = f'Olawale modified open test score is: {open_test_score:.4f}'
print(score_message)

Olawale modified open test score is: -0.5382
