In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.data.download_data import download_from_google_drive
from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in Colab using notebook 5.0. Unfortunately, the model-saving methods (the `.save()` class method and pickle) didn't reproduce the predicted probabilities: the models loaded from disk gave different probabilities from the ones predicted using the in-memory models. The loaded models' predicted probabilities were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
# Location of the CSV holding the per-class probabilities predicted on Colab
models_proba_path = ROOT_DIR.joinpath(
    'data/raw',
    'models_proba_most_coulmns_with_nans_no_split.csv',
)

In [4]:
# Load the saved class probabilities, fetching the CSV from Google Drive first
# when it is not already present locally.
if models_proba_path.is_file():
    models_proba = pd.read_csv(models_proba_path)
else:
    # Try downloading it from Google drive
    output_root = models_proba_path.parent
    file_id = {models_proba_path.name: '1ADjhoZKrd-WamPvrNwAdLy-a6M_fNcSq'}

    try:
        download_from_google_drive(file_id, output_root=output_root)
        models_proba = pd.read_csv(models_proba_path)
    except Exception:
        # Show the troubleshooting hints, then re-raise: swallowing the error
        # would leave `models_proba` undefined and make the following cells
        # fail with an unrelated NameError. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt/SystemExit propagate untouched.
        print('Check if the model proba file exists. If not, run notebook 6.0-rp-fit-predict-save-proba-fillnan-colab on Colab')
        print()
        print('Also, confirm the file id is the same here as shown in Google drive')
        raise

In [5]:
# Peek at 10 random rows; the fixed seed keeps the displayed sample identical
# across Restart & Run All.
models_proba.sample(10, random_state=42)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,WELL,DEPTH_MD
27428,0.005726,0.015645,0.915396,0.048197,0.000918,0.010942,0.000609,0.00048,0.000479,0.000605,0.000555,0.000449,25/10-10,2176.7104
24158,0.017057,0.008956,0.960784,0.001403,0.001301,0.004907,0.000786,0.000595,0.000593,0.002373,0.000689,0.000556,25/10-10,1679.6704
37518,0.020983,0.023916,0.93444,0.002184,0.001735,0.008764,0.001292,0.001155,0.001152,0.001901,0.001399,0.00108,25/5-3,1719.41919
41084,0.894094,0.019961,0.02127,0.018348,0.001578,0.038801,0.000919,0.000761,0.000759,0.001915,0.000881,0.000712,25/5-3,2261.45119
126450,0.021473,0.075899,0.835245,0.022744,0.002832,0.030086,0.002027,0.001811,0.001807,0.002285,0.002097,0.001694,35/6-2 S,2770.296467
107496,0.011826,0.119219,0.823095,0.009729,0.002081,0.021544,0.00149,0.001331,0.001328,0.002699,0.004415,0.001245,34/3-3 A,4229.025976
73274,0.137986,0.391794,0.454502,0.002043,0.001619,0.004309,0.001209,0.00108,0.001078,0.00175,0.00162,0.00101,34/10-16 R,826.328008
98660,0.09439,0.212602,0.329974,0.059509,0.011125,0.272025,0.003061,0.002735,0.002729,0.005545,0.003747,0.002558,34/3-3 A,2885.497975
102866,0.036182,0.10516,0.657076,0.026102,0.008147,0.14732,0.002286,0.002042,0.002037,0.00414,0.007598,0.00191,34/3-3 A,3524.809976
104293,0.023064,0.183413,0.733945,0.018543,0.002383,0.024328,0.001706,0.001524,0.001521,0.003091,0.005056,0.001426,34/3-3 A,3741.713975


In [6]:
# Sanity check: the class probabilities of every sample are expected to add up to 1
proba_row_sums = models_proba.loc[:, 'Sandstone':'Basement'].sum(axis=1)
proba_row_sums.describe()

count    1.367860e+05
mean     1.000000e+00
std      3.606714e-08
min      9.999999e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [7]:
# First five rows of the loaded probabilities
models_proba.head(n=5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,WELL,DEPTH_MD
0,0.045878,0.076793,0.851378,0.002922,0.001854,0.009912,0.001754,0.001333,0.00099,0.004697,0.001561,0.000929,15/9-14,480.628001
1,0.011491,0.026447,0.94993,0.001374,0.001208,0.002999,0.001669,0.000605,0.000604,0.002186,0.000922,0.000566,15/9-14,480.780001
2,0.012069,0.029862,0.94346,0.002315,0.00127,0.004159,0.001755,0.000636,0.000635,0.002298,0.000946,0.000595,15/9-14,480.932001
3,0.010736,0.028815,0.947129,0.002395,0.001121,0.003636,0.001549,0.000562,0.00056,0.002029,0.000943,0.000525,15/9-14,481.084001
4,0.008422,0.021015,0.958619,0.003395,0.000953,0.002129,0.001317,0.000478,0.000476,0.001725,0.001023,0.000447,15/9-14,481.236001


## y_true

In [8]:
# CSV with the open-test ground truth (read below as comma-separated)
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

# Pass the separator by keyword: recent pandas releases accept only the file
# path positionally in `read_csv`, so `read_csv(path, ',')` raises a TypeError.
csv_open_test = pd.read_csv(csv_open_test_path, sep=',')

In [9]:
# First five rows of the ground-truth table
csv_open_test.head(n=5)

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [10]:
# Translate the raw lithology keys into ordinals through KEYS_TO_ORDINAL
lithology_keys = csv_open_test['FORCE_2020_LITHOFACIES_LITHOLOGY']
y_true = lithology_keys.map(KEYS_TO_ORDINAL)

# Lithology with highest probability per sample

In [11]:
# Turn column names from lith to ordinal, keeping only the probability columns.
# The guard makes this cell safe to re-run: once the columns have been renamed
# there is no 'Sandstone' label left, and slicing on it again would raise a
# KeyError.
if 'Sandstone' in models_proba.columns:
    models_proba = models_proba.loc[:, 'Sandstone':'Basement']
    models_proba.columns = [LITHOLOGY_TO_ORDINAL[col] for col in models_proba.columns]

In [12]:
# For every sample, pick the column label holding the largest probability
y_pred = models_proba.idxmax(axis='columns')

# Score

In [13]:
# Compare the ground truth with the predictions using the project's `score`
# helper (imported from src.model.train_model).
# NOTE(review): the metric itself is not visible in this notebook — confirm
# its definition (and sign convention) in src/model/train_model.py.
open_test_score = score(y_true, y_pred)

In [14]:
# Report the score rounded to four decimal places
score_message = f'Olawale modified open test score is: {open_test_score:.4f}'
print(score_message)

Olawale modified open test score is: -0.5385
