In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.data.download_data import download_from_google_drive
from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in colab using notebook 5.0. Unfortunatelly, the model saving methods `.save()` class method, and pickle, didn't produce the same predicted probabilities using the loaded models as compared to the ones predictied using the in-memory models. The loaded models predicted probabilities had were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
models_proba_path = ROOT_DIR / 'data/raw' / 'models_proba_most_columns_with_nans_y_train_keys.csv'

In [4]:
if models_proba_path.is_file():
    models_proba = pd.read_csv(models_proba_path)
else:
    # Try downloading it from Google drive
    output_root = models_proba_path.parent
    file_id = {models_proba_path.name: '1E0NHK5G3ADpVhv3jLwvxQ6kr2xnkSm1F'}
    
    try:
        download_from_google_drive(file_id, output_root=output_root)
        models_proba = pd.read_csv(models_proba_path)
    except:
        print('Check if the model proba file exists. If not, run notebook 6.0-rp-fit-predict-save-proba-fillnan-colab on Colab')
        print()
        print('Also, confirm the file id is the same here as shown in Google drive')

Downloading...
From: https://drive.google.com/uc?id=1E0NHK5G3ADpVhv3jLwvxQ6kr2xnkSm1F
To: /media/hdd/projects/springboard/force_2020_lith/data/raw/models_proba_most_columns_with_nans_y_train_keys.csv
238MB [00:04, 50.3MB/s] 


In [5]:
models_proba.sample(10)

Unnamed: 0,Sandstone,Shale,Sandstone/Shale,Limestone,Chalk,Dolomite,Marl,Anhydrite,Halite,Coal,Basement,Tuff,MODEL,WELL,DEPTH_MD
44033,0.029132,0.75376,0.197438,0.005763,0.001024,0.001403,0.005253,0.000922,0.000922,0.002369,0.000862,0.001152,0,25/5-3,2709.69919
598953,0.004933,0.932757,0.056628,0.001869,0.000391,0.000641,0.000649,0.000352,0.000352,0.000413,0.000329,0.000687,4,29/3-1,1712.042001
791226,0.012009,0.870432,0.08688,0.011371,0.001201,0.001679,0.007711,0.001082,0.001083,0.003478,0.001012,0.002062,5,34/3-3 A,4198.625976
1192854,0.027032,0.71814,0.116483,0.086393,0.002429,0.003394,0.032907,0.002188,0.002189,0.00263,0.002046,0.004169,8,34/3-3 A,2871.209975
782127,0.005704,0.927726,0.045811,0.007469,0.000873,0.00122,0.006445,0.000786,0.000786,0.000945,0.000735,0.001498,5,34/3-3 A,2815.121976
69555,0.293818,0.610077,0.079208,0.004129,0.001349,0.00197,0.002268,0.001215,0.001215,0.001582,0.001136,0.002032,0,34/10-16 R,261.040008
537337,0.01535,0.875993,0.035157,0.035159,0.001399,0.001956,0.028233,0.001261,0.001261,0.001477,0.001179,0.001575,3,35/6-2 S,2850.704467
1255732,0.012335,0.957651,0.014548,0.006761,0.000906,0.001104,0.001649,0.000612,0.000612,0.000717,0.000572,0.002531,9,25/10-10,1755.6704
1119739,0.981465,0.003873,0.004923,0.002885,0.00088,0.000438,0.002034,0.000291,0.000292,0.00038,0.000273,0.002267,8,25/10-10,1876.2064
963164,0.0052,0.968818,0.017166,0.003204,0.001003,0.000992,0.000797,0.000369,0.000369,0.000674,0.000345,0.001063,7,15/9-14,1342.620001


In [6]:
# The sum of all probabilities for each sample should be 1
models_proba.loc[:, 'Sandstone':'Basement'].sum(axis=1).describe()

count    1.367860e+06
mean     9.903565e-01
std      7.123295e-02
min      4.912367e-02
25%      9.974276e-01
50%      9.981949e-01
75%      9.990577e-01
max      9.997907e-01
dtype: float64

In [7]:
models_proba.head()

Unnamed: 0,Sandstone,Shale,Sandstone/Shale,Limestone,Chalk,Dolomite,Marl,Anhydrite,Halite,Coal,Basement,Tuff,MODEL,WELL,DEPTH_MD
0,0.048974,0.851888,0.072319,0.009912,0.002905,0.001755,0.002834,0.000919,0.00114,0.001387,0.00086,0.005107,0,15/9-14,480.628001
1,0.008252,0.956854,0.026007,0.002047,0.001306,0.001079,0.000969,0.000462,0.000462,0.000697,0.000432,0.001434,0,15/9-14,480.780001
2,0.007822,0.955527,0.026314,0.002775,0.001203,0.001042,0.001878,0.000456,0.000456,0.000687,0.000426,0.001414,0,15/9-14,480.932001
3,0.008955,0.948963,0.029669,0.003263,0.001342,0.001142,0.002828,0.000508,0.000509,0.000767,0.000475,0.001578,0,15/9-14,481.084001
4,0.006531,0.963982,0.019045,0.001688,0.001055,0.000897,0.00329,0.000399,0.0004,0.001098,0.000374,0.00124,0,15/9-14,481.236001


## y_true

In [8]:
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

csv_open_test = pd.read_csv(csv_open_test_path, ',')

In [9]:
csv_open_test.head()

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [10]:
y_true = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(KEYS_TO_ORDINAL)

# Probabilities mean over models

In [11]:
# Container to hold added probabilities across models
models_probas_cumsum = sum((model_proba.loc[:, 'Sandstone':'Basement'].reset_index(drop=True)
                            for model_name, model_proba
                            in models_proba.groupby('MODEL')))

In [12]:
models_probas_cumsum

Unnamed: 0,Sandstone,Shale,Sandstone/Shale,Limestone,Chalk,Dolomite,Marl,Anhydrite,Halite,Coal,Basement
0,0.489735,8.518881,0.723194,0.099116,0.029048,0.017554,0.028342,0.009195,0.011397,0.013872,0.008599
1,0.082520,9.568545,0.260070,0.020472,0.013056,0.010787,0.009689,0.004618,0.004620,0.006965,0.004319
2,0.078219,9.555274,0.263144,0.027746,0.012030,0.010424,0.018777,0.004555,0.004557,0.006870,0.004260
3,0.089554,9.489628,0.296689,0.032632,0.013424,0.011419,0.028285,0.005083,0.005085,0.007666,0.004754
4,0.065313,9.639821,0.190447,0.016878,0.010550,0.008974,0.032902,0.003995,0.003996,0.010984,0.003736
...,...,...,...,...,...,...,...,...,...,...,...
136781,7.783632,0.417175,1.275400,0.375182,0.011238,0.015472,0.028954,0.010123,0.012085,0.048625,0.009467
136782,7.777924,0.400337,1.335715,0.336280,0.011382,0.015928,0.029238,0.010252,0.012240,0.048306,0.009588
136783,7.756688,0.468616,1.253064,0.369941,0.011228,0.015186,0.032527,0.010929,0.012074,0.047652,0.009458
136784,7.594705,0.556657,1.530843,0.169161,0.011244,0.015208,0.029601,0.010128,0.012091,0.048234,0.009472


In [13]:
models_len = len(models_proba['MODEL'].unique())

In [14]:
models_probas_mean = models_probas_cumsum / models_len

In [15]:
models_probas_mean.head()

Unnamed: 0,Sandstone,Shale,Sandstone/Shale,Limestone,Chalk,Dolomite,Marl,Anhydrite,Halite,Coal,Basement
0,0.048974,0.851888,0.072319,0.009912,0.002905,0.001755,0.002834,0.000919,0.00114,0.001387,0.00086
1,0.008252,0.956854,0.026007,0.002047,0.001306,0.001079,0.000969,0.000462,0.000462,0.000697,0.000432
2,0.007822,0.955527,0.026314,0.002775,0.001203,0.001042,0.001878,0.000456,0.000456,0.000687,0.000426
3,0.008955,0.948963,0.029669,0.003263,0.001342,0.001142,0.002828,0.000508,0.000509,0.000767,0.000475
4,0.006531,0.963982,0.019045,0.001688,0.001055,0.000897,0.00329,0.000399,0.0004,0.001098,0.000374


# Lithology with highest cumsum probability per sample

In [16]:
# Turn column names from lith to ordinal
models_probas_mean.columns = [LITHOLOGY_TO_ORDINAL[col] for col in models_probas_mean.columns]

In [17]:
y_pred = models_probas_mean.idxmax(axis=1)

# Score

In [18]:
open_test_score = score(y_true, y_pred)

In [19]:
print(f'Olawale modified open test score is: {open_test_score:.4f}')

Olawale modified open test score is: -0.5527
