In [1]:
import numpy as np
import pandas as pd

from src.definitions import ROOT_DIR

from src.data.download_data import download_competition_files
from src.model.train_model import score

In [2]:
# Auto-reload imported modules before each cell runs, so edits to the
# project's `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Import data

The competition winner's prediction on the hidden test data is not available in the competition data folder (GitHub or Google Drive). To create it, I pointed Colab to the [winner notebook](https://github.com/bolgebrygg/Force-2020-Machine-Learning-competition/blob/master/lithology_competition/code/OlawaleI/FORCE_Submission_File.ipynb) on GitHub and edited it to run with the hidden test data. The result is the notebook `2.0-rp-competition-winner-olawale-raw-colab-hidden-test`. It works as-is in Colab, but it struggles to run on my local machine (possibly due to a GPU misconfiguration). Note: my notebooks with `colab` in their names are meant to be run in Colab.

Also, I updated the data input folder to point to the [competition shared Google Drive folder](https://drive.google.com/drive/folders/1GIkjq4fwgwbiqVQxYwoJnOJWVobZ91pL) by adding a shortcut to it in my personal Google Drive. This way I can access the data without duplicating cloud storage.

Finally, I created a [Google Drive shared folder](https://drive.google.com/drive/folders/1ilFw-gfCSbvRjkbEDygTjuxN-Ixa3wxg) named `lith_pred` to keep the results of the Colab runs.

In this notebook, I explore the result of running the winning code on the hidden test data, in an attempt to reproduce and test the scoring function.

## Lithology mapping

In [3]:
# Maps each raw lithology code to its class index (0-11).
# The codes are listed in the order of the index they receive.
lithology_numbers = {
    code: class_index
    for class_index, code in enumerate((
        30000, 65030, 65000, 80000, 74000, 70000,
        70032, 88000, 86000, 99000, 90000, 93000,
    ))
}

## Olawale hidden y_pred

In [4]:
# Local folder, relative to the project root, that the prediction file is read from.
output_root = ROOT_DIR / 'data/external'
# CSV holding the competition winner's (Olawale) predictions on the hidden test data.
olawale_hidden_test_pred_path = output_root / 'olawale_hidden_test_pred.csv'

In [5]:
# Only trigger the download when the prediction CSV is not already on disk.
# NOTE(review): assumes download_competition_files() places the CSV at
# olawale_hidden_test_pred_path — confirm against src.data.download_data.
if not olawale_hidden_test_pred_path.is_file():
    download_competition_files()

In [6]:
# Load the winner's hidden-test predictions into a DataFrame.
olawale_hidden_test_pred = pd.read_csv(olawale_hidden_test_pred_path)

In [7]:
# Preview: the file carries a saved index column ('Unnamed: 0') and a
# '# lithology' column with the raw lithology codes.
olawale_hidden_test_pred.head()

Unnamed: 0,# lithology
0,65000
1,65000
2,65000
3,65000
4,65000


In [8]:
# Translate the predicted raw lithology codes into class indices.
# The Series is already 1-D, so no ravel() is needed.
y_pred = olawale_hidden_test_pred['# lithology'].map(lithology_numbers).to_numpy()

# Series.map leaves NaN for any code that is missing from the mapping, which
# would silently turn the class array into floats. Fail loudly instead.
assert not pd.isna(y_pred).any(), 'Prediction file contains lithology codes missing from lithology_numbers'

y_pred

array([2, 2, 2, ..., 1, 2, 2])

## y_true

In [9]:
# Hidden test data file; its label column is used below to build y_true.
csv_hidden_test_path = ROOT_DIR / 'data/external/CSV_hidden_test.csv'

# The file is semicolon-delimited. The separator must be passed by keyword:
# passing it positionally is deprecated and raises a TypeError on pandas 2.0+.
csv_hidden_test = pd.read_csv(csv_hidden_test_path, sep=';')

In [10]:
# Preview the hidden test data; the true labels are in the
# FORCE_2020_LITHOFACIES_LITHOLOGY column.
csv_hidden_test.head()

Unnamed: 0,WELL,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,GROUP,FORMATION,CALI,RSHA,RMED,...,ROP,DTS,DCAL,DRHO,MUDWEIGHT,RMIC,ROPA,RXO,FORCE_2020_LITHOFACIES_LITHOLOGY,FORCE_2020_LITHOFACIES_CONFIDENCE
0,15/9-23,1518.28,433906.75,6460000.5,-1493.241821,HORDALAND GP.,Skade Fm.,15.506232,,,...,146.526276,326.451263,-1.993768,0.109706,,,88.968864,,65000,3.0
1,15/9-23,1518.432,433906.75,6460000.5,-1493.393799,HORDALAND GP.,Skade Fm.,18.524611,,,...,147.605148,322.926361,1.024611,-0.006418,,,92.287186,,65000,3.0
2,15/9-23,1518.584,433906.75,6460000.5,-1493.545776,HORDALAND GP.,Skade Fm.,18.855669,,,...,140.783127,325.283142,1.355668,0.022769,,,95.605499,,65000,3.0
3,15/9-23,1518.736,433906.75,6460000.5,-1493.697754,HORDALAND GP.,Skade Fm.,19.163353,,,...,125.159531,334.233185,1.663353,0.024972,,,98.92382,,65000,3.0
4,15/9-23,1518.888,433906.75,6460000.5,-1493.849609,HORDALAND GP.,Skade Fm.,18.489744,,0.849849,...,107.576691,330.952362,0.989743,0.024527,,,102.242142,,65000,3.0


In [11]:
# Translate the true raw lithology codes into class indices.
# The Series is already 1-D, so no ravel() is needed.
y_true = csv_hidden_test['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_numbers).to_numpy()

# Series.map leaves NaN for any code that is missing from the mapping, which
# would silently turn the class array into floats. Fail loudly instead.
assert not pd.isna(y_true).any(), 'Hidden test labels contain lithology codes missing from lithology_numbers'

# Score

In [12]:
# Predictions and labels are read from two separate files; make sure they
# cover the same number of rows before comparing them.
assert len(y_true) == len(y_pred), (
    f'y_true has {len(y_true)} rows but y_pred has {len(y_pred)}'
)

# Score the winner's predictions against the hidden test labels with the
# project's scoring function.
olawale_hidden_test_score = score(y_true, y_pred)

In [14]:
# Report the score with four decimals.
print(f'Olawale hidden test score is: {olawale_hidden_test_score:.4f}')

Olawale hidden test score is: -0.4710


This is slightly lower (by 0.002) than the reported final test score of -0.4690. The difference could be due to the randomness of the process.