In [None]:
from pathlib import Path
import pickle

import numpy as np
import numpy.random as nr

import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Report the version of every main package used in this notebook
for package_name, package in (('pandas', pd),
                              ('numpy', np),
                              ('matplotlib', matplotlib),
                              ('sklearn', sklearn),
                              ('xgboost', xgb)):
    print(f'{package_name} version is: {package.__version__}')

pandas version is: 1.1.5
numpy version is: 1.19.5
matplotlib version is: 3.2.2
sklearn version is: 0.22.2.post1
xgboost version is: 0.90


# Helper functions

In [None]:
def replace_nan_inf(df, value=None):
    """
    Swap every missing or infinite entry of a dataframe for a single value.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose NaN, +inf and -inf entries are replaced.
    value : int, float, optional
        Replacement value. When omitted, numpy.nan is used, so infinities
        become NaN and existing NaN entries stay NaN.

    Returns
    -------
    pandas.DataFrame
        Result of ``df.replace``; `df` itself is not modified.

    """
    replacement = np.nan if value is None else value
    targets = [np.nan, np.inf, -np.inf]
    return df.replace(to_replace=targets, value=replacement)


def shift_concat(df, periods=1, fill_value=None):
    """
    Concatenate a dataframe with row-shifted copies of itself.

    For every offset from `periods` down to `-periods` the frame is shifted
    along the index and its columns are renamed ``<col>_shifted_<offset>``;
    offset 0 contributes the unshifted frame with its original column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with columns to be shifted.
    periods : int
        Largest shift, applied in both directions. Should be positive.
    fill_value : object, optional
        Scalar used for the rows introduced by the shift. Defaults to
        numpy.nan.

    Returns
    -------
    pandas.DataFrame
        Shifted dataframes concatenated along the columns axis, ordered from
        offset `periods` to offset `-periods`.

    Notes
    -------
    Based on Paulo Bestagini's augment_features_window from SEG 2016 ML
    competition.
    https://github.com/seg/2016-ml-contest/blob/master/ispl/facies_classification_try01.ipynb

    Example
    -------
    Shift df by one period and concatenate.

    >>> df = pd.DataFrame({'gr': [1.1, 2.1], 'den': [2.1, 2.2]})
    >>> shift_concat(df)
        gr_shifted_1  den_shifted_1  gr   den  gr_shifted_-1   den_shifted_-1
    0      NaN            NaN        1.1  2.1      2.1             2.2
    1      1.1            2.1        2.1  2.2      NaN             NaN

    """
    filler = np.nan if fill_value is None else fill_value

    def _shifted(offset):
        # Offset 0 is the frame itself, kept with its original column names
        if offset == 0:
            return df
        moved = df.shift(offset, fill_value=filler)
        moved.columns = [f'{col}_shifted_{str(offset)}' for col in moved.columns]
        return moved

    offsets = range(periods, -periods - 1, -1)
    return pd.concat([_shifted(offset) for offset in offsets], axis=1)


def gradient(df, depth_col):
    """
    Row-to-row gradient of every feature with respect to `depth_col`.

    Each feature difference between consecutive rows is divided by the depth
    difference between the same rows. Depth differences close to zero are
    replaced by 0.001 before dividing.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the feature columns and the depth column.
    depth_col : str
        Dataframe column name to be used as depth reference.

    Returns
    -------
    pandas.DataFrame
        Gradient of `df` along `depth_col`. Columns are named
        ``<col>_gradient``; the depth column is not in the output and the
        first row is NaN.

    Notes
    -------
    Based on Paulo Bestagini's augment_features_window from SEG 2016 ML
    competition.
    https://github.com/seg/2016-ml-contest/blob/master/ispl/facies_classification_try01.ipynb

    Example
    -------
    Calculate gradient of columns along `md`.

    >>> df = pd.DataFrame({'gr': [100.1, 100.2, 100.3],
                          'den': [2.1, 2.2, 2.3],
                          'md': [500, 500.5, 501]})
    >>> gradient(df, 'md')
       gr_gradient  den_gradient
    0          NaN           NaN
    1          0.2           0.2
    2          0.2           0.2

    """
    depth_step = df[depth_col].diff()
    # Repeated depths would divide by zero: use a small step instead
    depth_step = depth_step.mask(np.isclose(depth_step, 0), 0.001)

    feature_step = df.drop(columns=depth_col).diff().add_suffix('_gradient')

    return feature_step.div(depth_step, axis=0)


def shift_concat_gradient(df, depth_col, well_col, cat_cols, periods=1, fill_value=None):
    """
    Augment features using `shift_concat` and `gradient`, one well at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with columns to be augmented.
    depth_col : str
        Dataframe column name to be used as depth reference.
    well_col : str
        Dataframe column name to be used as well reference.
    cat_cols: list of str
        Encoded column names. The gradient calculation is not applied to these
        columns. The list is not modified.
    periods : int
        Number of periods to shift. Should be positive.
    fill_value : object, optional
        The scalar value to use for newly introduced missing values. Defaults
        to numpy.nan.

    Returns
    -------
    pandas.DataFrame
        Augmented dataframe: shifted features, the well and depth columns and
        the gradient features. Rows are grouped by well, in order of first
        appearance, and keep their original index labels.

    Notes
    -------
    Based on Paulo Bestagini's augment_features_window from SEG 2016 ML
    competition.
    https://github.com/seg/2016-ml-contest/blob/master/ispl/facies_classification_try01.ipynb

    Example
    -------
    Augment features of `df` by shifting and taking the gradient.

    >>> df = pd.DataFrame({'gr': [100.1, 100.2, 100.3, 20.1, 20.2, 20.3],
                          'den': [2.1, 2.2, 2.3, 1.7, 1.8, 1.9],
                           'md': [500, 500.5, 501, 1000, 1000.05, 1001],
                         'well': [1, 1, 1, 2, 2, 2]})
    >>> shift_concat_gradient(df, 'md', 'well', [], periods=1, fill_value=None)
        gr_shifted_1  den_shifted_1     gr    den  ...  well   md    gr_gradient  den_gradient
    0         NaN          NaN         100.1  2.1  ...   1   500.00        NaN           NaN
    1       100.1          2.1         100.2  2.2  ...   1   500.50   0.200000      0.200000
    2       100.2          2.2         100.3  2.3  ...   1   501.00   0.200000      0.200000
    3         NaN          NaN          20.1  1.7  ...   2  1000.00        NaN           NaN
    4        20.1          1.7          20.2  1.8  ...   2  1000.05   2.000000      2.000000
    5        20.2          1.8          20.3  1.9  ...   2  1001.00   0.105263      0.105263

    """
    # TODO 'Consider filling missing values created here with DataFrame.fillna'

    # Columns excluded from the gradient. Built as a new list so the caller's
    # `cat_cols` is never mutated.
    no_gradient_cols = list(cat_cols) + [well_col]
    gradient_cols = [col for col in df.columns if col not in no_gradient_cols]

    df_aug_groups = []
    for name, group in df.groupby(well_col, sort=False):
        # Well and depth are reference columns: they are not shifted
        shift_cols_df = group.drop([well_col, depth_col], axis=1)

        group_shift = shift_concat(shift_cols_df,
                                   periods=periods,
                                   fill_value=fill_value)

        # Add back the well name and this well's own depth values
        group_shift[well_col] = name
        group_shift[depth_col] = group[depth_col]

        # `gradient_cols` still contains the depth column, which `gradient`
        # uses as the denominator and removes from its output
        group_gradient = gradient(group.loc[:, gradient_cols], depth_col)

        df_aug_groups.append(pd.concat([group_shift, group_gradient], axis=1))

    return pd.concat(df_aug_groups)


def score(y_true, y_pred, scoring_matrix):
    """
    Competition scoring function.

    Every (true, predicted) pair selects one cell of `scoring_matrix`; the
    score is minus the sum of the selected cells divided by the number of
    samples in `y_true`.

    Parameters
    ----------
    y_true : pandas.Series
        Ground truth (correct) target values.
    y_pred : pandas.Series
        Estimated targets as returned by a classifier. Paired with `y_true`
        by position.
    scoring_matrix : numpy.array
        Competition scoring matrix, indexed as ``[true, predicted]``.

    Returns
    ----------
    float
        2020 FORCE ML lithology competition custom score.

    """
    accumulated = 0.0

    for true_pred_pair in zip(y_true, y_pred):
        # The (true, predicted) tuple addresses a single matrix cell
        accumulated = accumulated - scoring_matrix[true_pred_pair]

    n_samples = y_true.shape[0]
    return accumulated / n_samples


def show_evaluation(y_true, y_pred, scoring_matrix=None):
    """
    Print model performance and evaluation.

    Parameters
    ----------
    y_true : pandas.Series
        Ground truth (correct) target values.
    y_pred: pandas.Series
        Estimated targets as returned by a classifier.
    scoring_matrix : numpy.array, optional
        Competition scoring matrix forwarded to `score`. When omitted the
        competition score line is skipped, because `score` cannot be computed
        without it.

    """
    # `score` requires the scoring matrix; calling it with only two arguments
    # raises TypeError
    if scoring_matrix is not None:
        print(f'Competition score: {score(y_true, y_pred, scoring_matrix)}')
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print(f'F1: {f1_score(y_true, y_pred, average="weighted")}')


def build_encoding_map(series):
    """
    Build dictionary with the mapping of series unique values to encoded
    values.

    The code of a value is its position in ``series.unique()`` (order of
    first appearance). Missing values get no entry, but they still occupy a
    position, so the codes may have a gap.

    Parameters
    ----------
    series : pandas.Series
        Series with categories to be encoded.

    Returns
    -------
    mapping : dict
        Dictionary mapping unique non-missing categories in series to encoded
        values.

    See Also
    --------
    label_encode_columns : Label encode a dataframe categorical columns.

    """
    unique_values = series.unique()

    # `pd.isna` detects any missing value. An identity test against np.nan
    # only matches the np.nan singleton and lets other NaN objects (e.g. the
    # numpy.float64 NaN of a float column) become dictionary keys.
    is_missing = pd.isna(unique_values)

    mapping = {original: encoded
               for encoded, (original, missing)
               in enumerate(zip(unique_values, is_missing))
               if not missing}

    return mapping


def label_encode_columns(df, cat_cols, mappings):
    """
    Replace categorical columns by their label-encoded version.

    Works on a copy of `df`. Each column in `cat_cols` is removed and a
    ``<col>_encoded`` column, obtained by mapping its values through
    ``mappings[col]``, is added. Values without an entry in the mapping
    become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with columns to be encoded.
    cat_cols: list of str
        Column names to be encoded.
    mappings: dict of dict
        Dictionary containing a key-value mapping for each column to be
        encoded.

    Returns
    -------
    df : pandas.DataFrame
        Dataframe with the encoded columns added and the `cat_cols` removed.
    encoded_col_names: list of str
        Encoded column names.

    See Also
    --------
    build_encoding_map : Build a series encoding mapping.

    """
    encoded_df = df.copy()
    encoded_col_names = [f'{col}_encoded' for col in cat_cols]

    for col, encoded_name in zip(cat_cols, encoded_col_names):
        # `pop` removes the raw column and hands it over for encoding
        encoded_df[encoded_name] = encoded_df.pop(col).map(mappings[col])

    return encoded_df, encoded_col_names

# Target maps

In [None]:
# Competition lithology key -> ordinal class label (0-11)
KEYS_TO_ORDINAL = {
    30000: 0,
    65030: 1,
    65000: 2,
    80000: 3,
    74000: 4,
    70000: 5,
    70032: 6,
    88000: 7,
    86000: 8,
    99000: 9,
    90000: 10,
    93000: 11,
}

# Competition lithology key -> lithology name
KEYS_TO_LITHOLOGY = {
    30000: 'Sandstone',
    65030: 'Sandstone/Shale',
    65000: 'Shale',
    80000: 'Marl',
    74000: 'Dolomite',
    70000: 'Limestone',
    70032: 'Chalk',
    88000: 'Halite',
    86000: 'Anhydrite',
    99000: 'Tuff',
    90000: 'Coal',
    93000: 'Basement',
}

# Derived lookups, all obtained from the two dictionaries above
ORDINAL_TO_KEYS = {ordinal: key for key, ordinal in KEYS_TO_ORDINAL.items()}

ORDINAL_TO_LITHOLOGY = {ordinal: KEYS_TO_LITHOLOGY[key]
                        for ordinal, key in ORDINAL_TO_KEYS.items()}

LITHOLOGY_TO_ORDINAL = {lithology: ordinal
                        for ordinal, lithology in ORDINAL_TO_LITHOLOGY.items()}

# Import data

First add a shortcut from the [google drive competition data location](https://drive.google.com/drive/folders/1GIkjq4fwgwbiqVQxYwoJnOJWVobZ91pL) to your own google drive. We will mount this drive, and access the data from it.

We will save the results to a different folder, where we have write access.

In [None]:
# Mount Google Drive under /content/drive so the competition data can be read
# from it. `google.colab` is imported in this cell rather than with the other
# imports. NOTE(review): presumably because it only exists on Colab - confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Folder with the competition data inside the mounted drive. Edit it to match
# the location of the shortcut in your own Google Drive. The trailing slash is
# required: file names are appended to this string directly.
data_source = '/content/drive/My Drive/FORCE 2020 lithofacies prediction from well logs competition/'

In [None]:
# Competition penalty matrix, later passed to `score` as the scoring matrix
penalty_matrix = np.load(f'{data_source}penalty_matrix.npy')

# Train and hidden test sets are semicolon-separated CSV files
train = pd.read_csv(f'{data_source}CSV_train.csv', sep=';')

test = pd.read_csv(f'{data_source}CSV_hidden_test.csv', sep=';')

In [None]:
# Destination folder for the predicted probabilities written by
# Model.fit_predict; must be a location with write access
out_data_dir = Path('/content/drive/My Drive/lith_pred/')

# Train model

In [None]:
class Model():
    '''
    Lithology prediction model.

    Label encodes and augments the well-log features, fits one XGBoost
    classifier per cross-validation fold and predicts lithology
    probabilities with every fitted classifier.
    '''
    def preprocess(self, df, cat_columns, mappings):
        """
        Label encode the categorical columns and augment the features.

        Parameters
        ----------
        df : pandas.DataFrame
            Features. Must contain the 'DEPTH_MD' and 'WELL' columns.
        cat_columns : list of str
            Categorical column names to label encode.
        mappings : dict of dict
            Encoding mapping for each column in `cat_columns`.

        Returns
        -------
        pandas.DataFrame
            Encoded features augmented with shifted and gradient features.

        """
        # Label encode
        df, encoded_col_names = label_encode_columns(df, cat_columns, mappings)

        # Augment using Bestagini's functions
        df_preprocesed = shift_concat_gradient(df,
                                               'DEPTH_MD',
                                               'WELL',
                                               encoded_col_names,
                                               periods=1,
                                               fill_value=None)

        return df_preprocesed

    def _new_classifier(self):
        """Return a new, unfitted XGBoost classifier with the model hyperparameters."""
        return XGBClassifier(n_estimators=100, max_depth=10, booster='gbtree',
                             objective='multi:softprob', learning_rate=0.1, random_state=0,
                             subsample=0.9, colsample_bytree=0.9, tree_method='gpu_hist',
                             eval_metric='mlogloss', verbose=2020, reg_lambda=1500)

    def fit(self, X, y):
        """
        Fit one classifier per stratified cross-validation fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : pandas.Series
            Training targets.

        Returns
        -------
        list
            One fitted classifier per fold. Each one is trained on the fold's
            train part and uses the held-out part for early stopping.

        """
        split = 5
        # Seeded so the folds are the same on every run
        skf = StratifiedKFold(n_splits=split, shuffle=True, random_state=0)

        models = []
        for fold_number, (train_index, test_index) in enumerate(skf.split(X, y)):
            print(f'Fitting fold: {fold_number}')

            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # A new estimator per fold: refitting a single shared instance
            # would leave `models` holding several references to the last fit
            model = self._new_classifier()

            model.fit(X_train,
                      y_train,
                      early_stopping_rounds=100,
                      eval_set=[(X_test, y_test)],
                      verbose=100)

            models.append(model)

        return models

    def fit_predict(self, X_train, y_train, X_pred, pred_wells, save_filename):
        """
        Fit the fold models, predict lithology probabilities and save them.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Training features.
        y_train : pandas.Series
            Training targets (ordinal lithology classes).
        X_pred : pandas.DataFrame
            Features to predict on. Must contain the 'DEPTH_MD' column.
        pred_wells : pandas.Series
            'WELL' values of the samples in `X_pred`. Its index is stored in
            the 'index' column of the output.
        save_filename : pathlib.Path
            CSV file the probabilities are written to. Missing parent
            directories are created.

        Returns
        -------
        models : list
            Fitted classifiers, one per fold.
        models_proba : pandas.DataFrame
            One row per sample and model: a probability column per lithology
            plus the 'MODEL', 'index', 'WELL' and 'DEPTH_MD' columns.

        """
        # Fit
        models = self.fit(X_train, y_train)

        # Sample index, well and MD are the same for every model: build once.
        # `reset_index` returns new objects, so `X_pred` is left untouched.
        # NOTE(review): the rows are paired by position; this assumes
        # `pred_wells` is in the same row order as `X_pred` and has an unnamed
        # index (so that `reset_index` yields an 'index' column) - confirm.
        pred_wells_df = pred_wells.reset_index()
        md = X_pred['DEPTH_MD'].reset_index(drop=True)

        # Get lithologies probabilities for each model
        models_proba = []
        for model_num, model in enumerate(models):
            model_proba = model.predict_proba(X_pred)
            model_classes = [ORDINAL_TO_LITHOLOGY[lith] for lith in model.classes_]

            model_proba_df = pd.DataFrame(model_proba, columns=model_classes)

            model_proba_df['MODEL'] = model_num
            model_proba_df['index'] = pred_wells_df['index']
            model_proba_df['WELL'] = pred_wells_df['WELL']
            model_proba_df['DEPTH_MD'] = md

            models_proba.append(model_proba_df)

        models_proba = pd.concat(models_proba, ignore_index=True)

        # Create save directory if it doesn't exist
        save_filename.parent.mkdir(parents=True, exist_ok=True)

        # Save models_proba to CSV
        models_proba.to_csv(save_filename, index=False)

        return models, models_proba

# Prepare train data

In [None]:
# Build group of groups: stratigraphic groups merged into a common super group
group_of_groups = {
    'VIKING GP.': 'VTB GP.',
    'BOKNFJORD GP.': 'VTB GP.',
    'TYNE GP.': 'VTB GP.',
    'ROTLIEGENDES GP.': 'PERMIAN GP.',
    'ZECHSTEIN GP.': 'PERMIAN GP.',
    }

# Assign the replaced column directly. Calling `replace(inplace=True)` on the
# selected column is a chained in-place operation, which may act on a
# temporary copy and leave `train` unchanged.
train['GROUPED'] = train['GROUP'].replace(group_of_groups)

train = train.drop(columns='GROUP')

In [None]:
# Convert the competition lithology keys (30000, 65030, ...) into the 0-11
# ordinal class labels. Not idempotent: the ordinals are not keys of
# KEYS_TO_ORDINAL, so running this cell a second time maps every label to NaN.
train['FORCE_2020_LITHOFACIES_LITHOLOGY'] = train['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(KEYS_TO_ORDINAL)

In [None]:
# Categorical columns that are label encoded before training
cat_columns = ['FORMATION']

# One value -> integer code mapping per categorical column, built from the
# train set; the same mappings are later applied to the predict set
train_mappings = {col: build_encoding_map(train[col]) for col in cat_columns}

In [None]:
# Columns with a high percent of missing values, not used by the model
sparse_logs = ('FORCE_2020_LITHOFACIES_CONFIDENCE',
               'SGR',
               'DTS',
               'DCAL',
               'RMIC',
               'ROPA',
               'RXO')

# Only drop the columns that are actually present in the train set
drop_cols = [log for log in sparse_logs if log in train.columns]

train.drop(columns=drop_cols, inplace=True)

In [None]:
# Use different logs per group: within each group keep only the columns whose
# fraction of non-missing samples is above `limit`
limit = 0.68
keep_logs_per_group = {}
for group_name, group in train.groupby('GROUPED'):
    # Fraction of non-missing values of every column inside this group
    group_log_coverage = group.notna().sum() / group.shape[0]

    keep_logs = list(group_log_coverage.index[group_log_coverage > limit])

    # FORMATION is always kept, whatever its coverage
    if 'FORMATION' not in keep_logs:
        keep_logs.append('FORMATION')

    keep_logs_per_group[group_name] = keep_logs

# Prepare predict data

In [None]:
# Keep the hidden test labels, converted to ordinal classes, for the final
# scoring; the label column is dropped from `test` in the next cell
y_true = test['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(KEYS_TO_ORDINAL)

In [None]:
# Same group merging as the train set. The replaced column is assigned
# directly: `replace(inplace=True)` on the selected column is a chained
# in-place operation, which may act on a temporary copy and leave `test`
# unchanged.
test['GROUPED'] = test['GROUP'].replace(group_of_groups)

# The raw group and the target must not be part of the predict features
test = test.drop(columns=['GROUP', 'FORCE_2020_LITHOFACIES_LITHOLOGY'])

In [None]:
# Drop from the predict set the same columns dropped from the train set,
# restricted to the ones that are actually present in `test`
test_drop_cols = [col for col in drop_cols if col in test.columns]

test.drop(test_drop_cols, axis=1, inplace=True)

In [None]:
# Columns left in the predict set after the drops above
test.columns

Index(['WELL', 'DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'FORMATION', 'CALI',
       'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS',
       'ROP', 'DRHO', 'MUDWEIGHT', 'GROUPED'],
      dtype='object')

# Fit predict groups

In [None]:
# A lithology is dropped from a group's train data when it has this many
# samples or fewer (Model.fit uses 5 stratified folds)
MIN_SAMPLES_PER_LITHOLOGY = 5
target_col = 'FORCE_2020_LITHOFACIES_LITHOLOGY'

# Groups available for training, computed once instead of on every iteration
train_group_names = set(train['GROUPED'].unique())

models_probas_list = []
for group_name, group in test.groupby('GROUPED'):

    if group_name not in train_group_names:
        print(f'Group {group_name} is in the prediction set')
        print('but not in the train set')
        print('This functionality is currently not supported')
        print()

        # TODO: What happens when there is a group in the predict set but not in the train set?
        # For now the samples of such a group get no prediction.
        continue

    # Select train group features
    keep_cols = keep_logs_per_group[group_name]
    group_train = (train.loc[train['GROUPED'] == group_name, keep_cols]
                   .drop(columns='GROUPED'))

    # Drop lithofacies with too few samples
    lith_counts = group_train[target_col].value_counts()
    rare_liths = lith_counts.index[lith_counts <= MIN_SAMPLES_PER_LITHOLOGY]
    group_train = group_train.loc[~group_train[target_col].isin(rare_liths), :]

    model = Model()

    # Define train target and features
    y_train = group_train[target_col]

    X = group_train.drop(columns=target_col)

    # Augment train features
    X_train = model.preprocess(X, cat_columns, train_mappings)
    X_train = X_train.drop(columns='WELL')

    # Define predict features: same columns as the train features
    pred_keep_cols = [col for col in keep_cols if col != target_col]
    X_pred = group.loc[:, pred_keep_cols].drop(columns='GROUPED')

    # Augment predict features
    X_pred = model.preprocess(X_pred, cat_columns, train_mappings)
    X_pred = X_pred.drop(columns='WELL')

    # Group names end with a dot ('BAAT GP.'); strip it so the extension is
    # written explicitly and is correct for any group name
    fn = '_'.join(group_name.lower().split()).rstrip('.')
    save_filename = out_data_dir / f'model_proba/grouped/00_hidden/models_proba_grouped_{fn}.csv'

    predict_group_wells = group['WELL']

    print(f'Fitting group: {group_name}')
    models, models_proba = model.fit_predict(X_train,
                                             y_train,
                                             X_pred,
                                             predict_group_wells,
                                             save_filename)

    models_probas_list.append(models_proba)

Fitting group: BAAT GP.
Fitting fold: 0
[0]	validation_0-mlogloss:1.8506
Will train until validation_0-mlogloss hasn't improved in 100 rounds.
[99]	validation_0-mlogloss:0.566197
Fitting fold: 1
[0]	validation_0-mlogloss:1.85069
Will train until validation_0-mlogloss hasn't improved in 100 rounds.
[99]	validation_0-mlogloss:0.57527
Fitting fold: 2
[0]	validation_0-mlogloss:1.85092
Will train until validation_0-mlogloss hasn't improved in 100 rounds.
[99]	validation_0-mlogloss:0.580474
Fitting fold: 3
[0]	validation_0-mlogloss:1.85112
Will train until validation_0-mlogloss hasn't improved in 100 rounds.
[99]	validation_0-mlogloss:0.58812
Fitting fold: 4
[0]	validation_0-mlogloss:1.85082
Will train until validation_0-mlogloss hasn't improved in 100 rounds.
[99]	validation_0-mlogloss:0.579933
Fitting group: CROMER KNOLL GP.
Fitting fold: 0
[0]	validation_0-mlogloss:1.95605
Will train until validation_0-mlogloss hasn't improved in 100 rounds.
[99]	validation_0-mlogloss:0.445191
Fitting fol

In [None]:
# Stack the probabilities of all groups into a single dataframe
models_probas = pd.concat(models_probas_list, ignore_index=True)

In [None]:
# Column order of the probabilities dataframe: the twelve lithology
# probability columns first ('Sandstone' to 'Tuff'), then the sample
# identification columns
cols_ordered = ['Sandstone', 'Sandstone/Shale', 'Shale', 'Marl', 'Dolomite',
                'Limestone', 'Chalk', 'Coal', 'Anhydrite', 'Halite', 'Basement', 'Tuff',
                'MODEL', 'index', 'WELL', 'DEPTH_MD'
               ]

In [None]:
# Reorder the columns. `reindex` is used instead of `.loc` so that a lithology
# absent from every fitted model yields a NaN column instead of a KeyError.
models_probas = models_probas.reindex(columns=cols_ordered)

In [None]:
# A model only outputs probability columns for the lithologies in its
# `classes_`, so the concatenation leaves NaN for the other lithologies:
# treat them as zero probability
models_probas.fillna(0.0, inplace=True)

# y_true

In [None]:
# Quick look at the ordinal-encoded true labels
y_true.head()

0    2
1    2
2    2
3    2
4    2
Name: FORCE_2020_LITHOFACIES_LITHOLOGY, dtype: int64

# Probability mean over models

In [None]:
# Sum, for every sample, the probabilities predicted by all the models.
# Only the lithology columns are aggregated: summing the whole frame would
# also aggregate the string 'WELL' column and the 'MODEL' / 'DEPTH_MD' columns.
lithology_cols = list(models_probas.loc[:, 'Sandstone':'Tuff'].columns)

models_probas_cumsums = models_probas.groupby('index')[lithology_cols].sum()

In [None]:
# Number of distinct model ids (one id per cross-validation fold)
models_len = len(models_probas['MODEL'].unique())

In [None]:
# Mean probability per sample: summed probabilities divided by the number of
# models. NOTE(review): assumes every sample was predicted by all
# `models_len` models - confirm.
models_probas_mean = models_probas_cumsums / models_len

# Lithology with highest probability per sample

In [None]:
# Rename the lithology name columns to their ordinal class, so that the
# column label of the highest probability is directly the predicted class
models_probas_mean.columns = [LITHOLOGY_TO_ORDINAL[col] for col in models_probas_mean.columns]

In [None]:
# Predicted class of each sample: column (ordinal) with the highest mean probability
y_pred = models_probas_mean.idxmax(axis=1)

# Score

In [None]:
# `score` pairs y_true and y_pred by position (zip) and divides by the length
# of y_true, so both must have the same length and row order.
# NOTE(review): samples of a group skipped by the fit loop get no prediction;
# confirm that y_pred covers every row of y_true.
hidden_test_score = score(y_true, y_pred, penalty_matrix)

In [None]:
# Report the competition score on the hidden test set (four decimals)
print(f'Olawale modified hidden test score is: {hidden_test_score:.4f}')

Olawale modified hidden test score is: -0.5064
