In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from src.data.download_data import download_from_google_drive
from src.definitions import ROOT_DIR, KEYS_TO_ORDINAL, KEYS_TO_LITHOLOGY
from src.definitions import ORDINAL_TO_KEYS, ORDINAL_TO_LITHOLOGY, LITHOLOGY_TO_ORDINAL
from src.model.train_model import score

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Import data

The model fitting was done in Colab using notebook 5.0. Unfortunately, models saved with the `.save()` class method or with pickle did not reproduce, once loaded, the probabilities predicted by the in-memory models: the loaded models' predicted probabilities were all the same value.

As a result, I decided to do the fit and predict in colab, and then save the predictions. In this notebook, we load these predictions.

## y_pred

In [3]:
# Expected on-disk location of the predicted-probabilities CSV, under the project root.
models_proba_path = ROOT_DIR.joinpath(
    'data/raw',
    'models_proba_most_columns_with_nans_no_split_rf.csv',
)

In [4]:
# Load the predicted probabilities from the local CSV when it exists;
# otherwise try to fetch the file from Google Drive first and then load it.
if models_proba_path.is_file():
    models_proba = pd.read_csv(models_proba_path)
else:
    # Try downloading it from Google drive
    output_root = models_proba_path.parent
    # Maps the target file name to its Google Drive file id
    # (presumably the format `download_from_google_drive` expects - confirm in src.data.download_data).
    file_id = {models_proba_path.name: '19F0cO5ABlQYdahOHzqfBIJPNuo5B9jIZ'}

    try:
        download_from_google_drive(file_id, output_root=output_root)
        models_proba = pd.read_csv(models_proba_path)
    except Exception:
        print('Check if the model proba file exists. If not, run notebook 6.0-rp-fit-predict-save-proba-fillnan-colab on Colab')
        print()
        print('Also, confirm the file id is the same here as shown in Google drive')
        # Re-raise after printing the hints: swallowing the error would leave
        # `models_proba` undefined and make later cells fail with an unrelated NameError.
        raise

Downloading...
From: https://drive.google.com/uc?id=19F0cO5ABlQYdahOHzqfBIJPNuo5B9jIZ
To: /media/hdd/projects/springboard/force_2020_lith/data/raw/models_proba_most_columns_with_nans_no_split_rf.csv
10.4MB [00:00, 25.9MB/s]


In [5]:
# Preview ten randomly chosen rows of the loaded probabilities.
models_proba.sample(n=10)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,WELL,DEPTH_MD
108138,0.01,0.23,0.66,0.03,0.01,0.06,0.0,0.0,0.0,0.0,0.0,0.0,34/3-3 A,4326.609976
53112,0.1,0.17,0.65,0.0,0.0,0.05,0.0,0.0,0.0,0.03,0.0,0.0,29/3-1,1911.770001
98200,0.04,0.3,0.58,0.03,0.01,0.04,0.0,0.0,0.0,0.0,0.0,0.0,34/3-3 A,2815.577975
49120,0.74,0.14,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29/3-1,1303.314001
9823,0.1,0.08,0.75,0.0,0.0,0.01,0.0,0.0,0.0,0.06,0.0,0.0,15/9-14,1975.092001
93868,0.97,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,34/10-16 R,3957.072008
113885,0.49,0.27,0.13,0.01,0.0,0.09,0.0,0.0,0.0,0.0,0.01,0.0,34/6-1 S,3546.6664
109526,0.02,0.28,0.59,0.04,0.01,0.06,0.0,0.0,0.0,0.0,0.0,0.0,34/3-3 A,4537.585976
12596,0.0,0.04,0.45,0.0,0.0,0.05,0.0,0.0,0.0,0.46,0.0,0.0,15/9-14,2396.588001
64207,0.96,0.03,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,29/3-1,3609.002001


In [6]:
# Sanity check: the lithology probabilities of each row should add up to 1,
# so every summary statistic of the row totals is expected to be ~1.
(
    models_proba
    .loc[:, 'Sandstone':'Basement']
    .sum(axis='columns')
    .describe()
)

count    1.367860e+05
mean     1.000000e+00
std      4.655019e-17
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [7]:
# Show the first five rows to inspect row order and the WELL / DEPTH_MD columns.
models_proba.head(5)

Unnamed: 0,Sandstone,Sandstone/Shale,Shale,Marl,Dolomite,Limestone,Chalk,Halite,Anhydrite,Tuff,Coal,Basement,WELL,DEPTH_MD
0,0.11,0.11,0.72,0.01,0.0,0.04,0.0,0.0,0.0,0.01,0.0,0.0,15/9-14,480.628001
1,0.08,0.17,0.7,0.0,0.0,0.04,0.0,0.0,0.0,0.01,0.0,0.0,15/9-14,480.780001
2,0.07,0.2,0.66,0.0,0.0,0.06,0.0,0.0,0.0,0.01,0.0,0.0,15/9-14,480.932001
3,0.07,0.14,0.75,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,15/9-14,481.084001
4,0.06,0.12,0.8,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,15/9-14,481.236001


## y_true

In [8]:
# Location of the open-test ground-truth labels, under the project root.
csv_open_test_path = ROOT_DIR / 'data/external/open_test_y_true.csv'

# Pass the separator by keyword: giving it positionally is deprecated in
# pandas 1.x and raises a TypeError in pandas 2.x.
csv_open_test = pd.read_csv(csv_open_test_path, sep=',')

In [9]:
# Show the first five rows of the ground-truth table.
csv_open_test.head(5)

Unnamed: 0,WELL,DEPTH_MD,FORCE_2020_LITHOFACIES_LITHOLOGY
0,15/9-14,480.628001,65000
1,15/9-14,480.780001,65000
2,15/9-14,480.932001,65000
3,15/9-14,481.084001,65000
4,15/9-14,481.236001,65000


In [10]:
# Translate the raw FORCE_2020_LITHOFACIES_LITHOLOGY values through
# KEYS_TO_ORDINAL to obtain the true labels used for scoring.
y_true = (
    csv_open_test
    .loc[:, 'FORCE_2020_LITHOFACIES_LITHOLOGY']
    .map(KEYS_TO_ORDINAL)
)

# Lithology with highest probability per sample

In [11]:
# Keep only the lithology probability columns ('Sandstone' through 'Basement')
# and relabel each one with its LITHOLOGY_TO_ORDINAL value.
# NOTE: this rebinds `models_proba`, so the cell cannot be re-run on its own;
# re-run the loading cell first.
models_proba = (
    models_proba
    .loc[:, 'Sandstone':'Basement']
    .rename(columns=lambda lithology: LITHOLOGY_TO_ORDINAL[lithology])
)

In [12]:
# For every row, take the column label holding the largest probability as the prediction.
y_pred = models_proba.idxmax(axis='columns')

# Score

In [13]:
# Evaluate the predictions with the project's `score` helper (src.model.train_model).
# NOTE(review): assumes `y_true` and `y_pred` are in the same row order - the two
# CSVs are never joined on WELL / DEPTH_MD here; confirm.
open_test_score = score(y_true, y_pred)

In [14]:
# Report the open-test score rounded to four decimal places.
score_report = f'Olawale modified open test score is: {open_test_score:.4f}'
print(score_report)

Olawale modified open test score is: -0.5736
