In [1]:
import numpy as np
import pandas as pd

from src.features.build_features import replace_nan_inf, shift_concat, gradient, shift_concat_gradient

In [2]:
%load_ext autoreload
%autoreload 2

# Bestagini's feature augmentation

The following functions come from [Paulo Bestagini's feature augmentation technique from the SEG 2016 ML competition](https://github.com/seg/2016-ml-contest/tree/master/ispl). They were designed to work with numpy arrays, but I adapted them to work with pandas. Here I compare the output of both approaches.

## Augment features window: shift_concat
This function shifts the logs down and up by a given number of indexes: `N_neig` in the numpy version, `periods` in the pandas version.
Ideally, this function should only be applied to the depth dependent data (i.e., the logs) and not the well-level data (i.e. well name).

In [3]:
def augment_features_window(X, N_neig):
    """Concatenate every row with its ``N_neig`` previous and next rows.

    Parameters
    ----------
    X : numpy.ndarray, shape (N_row, N_feat)
        Feature matrix, one sample per row.
    N_neig : int
        Number of neighbouring rows taken on each side of a row.

    Returns
    -------
    numpy.ndarray, shape (N_row, N_feat * (2 * N_neig + 1))
        For each row, the features of the rows at offsets
        ``-N_neig, ..., 0, ..., +N_neig`` laid side by side. Neighbours
        that fall outside ``X`` are filled with zeros.
    """
    # Parameters
    N_row = X.shape[0]
    N_feat = X.shape[1]

    # Zero padding: N_neig all-zero rows before and after the data so that
    # every window is fully defined.
    pad = np.zeros((N_neig, N_feat))
    X_padded = np.vstack((pad, X, pad))

    # Fill one column block per window offset with a single slice assignment
    # instead of rebuilding each output row with repeated np.hstack calls.
    X_aug = np.zeros((N_row, N_feat * (2 * N_neig + 1)))
    for k in range(2 * N_neig + 1):
        # Block k holds the row at offset (k - N_neig) from the current row.
        X_aug[:, k * N_feat:(k + 1) * N_feat] = X_padded[k:k + N_row]

    return X_aug

In [4]:
# Minimal two-row frame with two columns, used to compare the numpy and
# pandas window functions side by side.
test_df = pd.DataFrame({'gr': [1.1, 2.1], 'den': [2.1, 2.2]})
test_df

Unnamed: 0,gr,den
0,1.1,2.1
1,2.1,2.2


### Shift one period

In [5]:
# NumPy version: one neighbouring row on each side; neighbours outside the
# array are zero-padded.
augment_features_window(test_df.values, N_neig=1)

array([[0. , 0. , 1.1, 2.1, 2.1, 2.2],
       [1.1, 2.1, 2.1, 2.2, 0. , 0. ]])

In [6]:
# pandas counterpart called with its default arguments; compare with the
# NumPy output above.
shift_concat(test_df)

Unnamed: 0,gr_shifted_1,den_shifted_1,gr,den,gr_shifted_-1,den_shifted_-1
0,,,1.1,2.1,2.1,2.2
1,1.1,2.1,2.1,2.2,,


### Shift two periods

In [7]:
# NumPy version with two neighbouring rows on each side.
augment_features_window(test_df.values, N_neig=2)

array([[0. , 0. , 0. , 0. , 1.1, 2.1, 2.1, 2.2, 0. , 0. ],
       [0. , 0. , 1.1, 2.1, 2.1, 2.2, 0. , 0. , 0. , 0. ]])

In [8]:
# pandas counterpart with periods=2; compare with the NumPy output above.
shift_concat(test_df, periods=2)

Unnamed: 0,gr_shifted_2,den_shifted_2,gr_shifted_1,den_shifted_1,gr,den,gr_shifted_-1,den_shifted_-1,gr_shifted_-2,den_shifted_-2
0,,,,,1.1,2.1,2.1,2.2,,
1,,,1.1,2.1,2.1,2.2,,,,


## Augment features gradient: gradient

I updated this function to place each gradient on the following row, leaving the first row as NaN, to be consistent with `pandas.DataFrame.diff`. In the numpy version, the gradient is calculated as the next value minus the current value (look-ahead), and the missing values introduced by the function (at the tail of the array) are filled with zeros. In the pandas version, I decided to fill the missing values with `numpy.nan` instead.

This function should only be applied to numeric columns, except for the reference depth, i.e., `DEPTH_MD`.

In [9]:
def augment_features_gradient(X, depth):
    """Compute the look-ahead gradient of each feature with respect to depth.

    Parameters
    ----------
    X : array-like, shape (N_row, N_feat)
        Feature matrix, one sample per row.
    depth : array-like, shape (N_row,)
        Depth of each sample. Any numeric dtype is accepted.

    Returns
    -------
    numpy.ndarray, shape (N_row, N_feat)
        Row ``i`` holds ``(X[i + 1] - X[i]) / (depth[i + 1] - depth[i])``.
        The last row, which has no following sample, is filled with zeros.
        An input with zero rows yields an output with zero rows.
    """
    X = np.asarray(X)
    if X.shape[0] == 0:
        # Nothing to differentiate; keep the row count equal to the input's
        # instead of emitting a lone zero row.
        return np.zeros((0, X.shape[1]))

    # Compute features gradient.
    # Cast depth to float first: on an integer array the 0.001 replacement
    # below would be truncated back to 0 and the division would yield inf/nan.
    d_diff = np.diff(np.asarray(depth, dtype=float)).reshape((-1, 1))
    # Repeated depths would divide by zero; substitute a small step instead.
    d_diff[d_diff == 0] = 0.001
    X_diff = np.diff(X, axis=0)
    X_grad = X_diff / d_diff

    # Compensate for last missing value
    X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))

    return X_grad

In [10]:
# Three samples with two log columns plus a depth column ('md') for the
# gradient comparison.
test_df = pd.DataFrame({'gr': [100.1, 100.2, 100.3], 'den': [2.1, 2.2, 2.3], 'md': [500, 500.5, 501]})
test_df

Unnamed: 0,gr,den,md
0,100.1,2.1,500.0
1,100.2,2.2,500.5
2,100.3,2.3,501.0


In [11]:
depth_col = 'md'

# Feature matrix: every column except the depth column, as a numpy array.
X = test_df.drop(depth_col, axis=1).values

# Depth values as a 1-D numpy array.
depth = test_df[depth_col].values

# NumPy version: look-ahead gradient, last row filled with zeros.
augment_features_gradient(X, depth)

array([[0.2, 0.2],
       [0.2, 0.2],
       [0. , 0. ]])

In [12]:
# pandas counterpart, with 'md' passed as the depth column; compare with the
# NumPy output above.
gradient(test_df, 'md')

Unnamed: 0,gr_gradient,den_gradient
0,,
1,0.2,0.2
2,0.2,0.2


## Augment features

In [13]:
def augment_features(X, well, depth, N_neig=1):
    """Augment features well by well with a row window and a depth gradient.

    For every distinct value in ``well`` the rows of that well are passed to
    ``augment_features_window`` and ``augment_features_gradient`` and the two
    results are concatenated column-wise.

    Parameters
    ----------
    X : array-like, shape (N_row, N_feat)
        Feature matrix, one sample per row.
    well : array-like, shape (N_row,)
        Well identifier of each row.
    depth : array-like, shape (N_row,)
        Depth of each row.
    N_neig : int, default 1
        Number of neighbouring rows on each side used by the window.

    Returns
    -------
    X_aug : numpy.ndarray, shape (N_row, N_feat * (2 * N_neig + 2))
        Window features followed by gradient features.
    padded_rows : numpy.ndarray of int
        Sorted indices of the rows that contain padding: the first and last
        ``N_neig`` rows of each well (window padding) and the last row of
        each well (gradient padding).
    """
    X = np.asarray(X)
    well = np.asarray(well)
    depth = np.asarray(depth)

    # Augment features
    X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))
    is_padded = np.zeros(X.shape[0], dtype=bool)
    for w in np.unique(well):
        w_idx = np.where(well == w)[0]
        X_aug_win = augment_features_window(X[w_idx, :], N_neig)
        X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])
        X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)

        # Record where padding was inserted instead of searching the values
        # for zeros: a fixed 7-column comparison fails to broadcast when
        # X_aug has fewer than 7 columns, and genuine zero measurements would
        # be reported as padding.
        n_edge = min(N_neig, w_idx.size)
        is_padded[w_idx[:n_edge]] = True               # leading window pad
        is_padded[w_idx[w_idx.size - n_edge:]] = True  # trailing window pad
        is_padded[w_idx[-1]] = True                    # gradient pad

    # Find padded rows
    padded_rows = np.where(is_padded)[0]

    return X_aug, padded_rows

In [14]:
# Two wells (1 and 2) with three samples each, to check that the augmentation
# is applied per well.
test_df = pd.DataFrame({'gr': [100.1, 100.2, 100.3, 20.1, 20.2, 20.3],
                        'den': [2.1, 2.2, 2.3, 1.7, 1.8, 1.9],
                        'md': [500, 500.5, 501, 1000, 1000.05, 1001],
                        'well': [1, 1, 1, 2, 2, 2]})
test_df

Unnamed: 0,gr,den,md,well
0,100.1,2.1,500.0,1
1,100.2,2.2,500.5,1
2,100.3,2.3,501.0,1
3,20.1,1.7,1000.0,2
4,20.2,1.8,1000.05,2
5,20.3,1.9,1001.0,2


In [15]:
depth_col = 'md'
well_col = 'well'

# Feature matrix: every column except depth and well, as a numpy array.
X = test_df.drop([depth_col, well_col], axis=1).values

# Depth and well identifiers as 1-D numpy arrays aligned with the rows of X.
depth = test_df[depth_col].values
well = test_df[well_col].values

# NumPy version with the default N_neig.
X_aug, padded_rows = augment_features(X, well, depth)

In [16]:
# Wrap the augmented array in a DataFrame for display only.
pd.DataFrame(X_aug)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,100.1,2.1,100.2,2.2,0.2,0.2
1,100.1,2.1,100.2,2.2,100.3,2.3,0.2,0.2
2,100.2,2.2,100.3,2.3,0.0,0.0,0.0,0.0
3,0.0,0.0,20.1,1.7,20.2,1.8,2.0,2.0
4,20.1,1.7,20.2,1.8,20.3,1.9,0.105263,0.105263
5,20.2,1.8,20.3,1.9,0.0,0.0,0.0,0.0


In [17]:
# Row indices that augment_features reported as padded.
padded_rows

array([0, 2, 3, 5])

Bestagini drops these padded rows from the training data in his notebook. On the other hand, Olawale doesn't filter the augmented numpy array (`X_aug`) using `padded_rows`. In this case, I agree with the latter approach, with the difference of leaving the missing values as `numpy.nan`. As a result, the pandas version of this function doesn't need to output the padded rows.

In [18]:
# pandas counterpart of augment_features; fill_value=None is passed here and
# the output below shows NaN where the NumPy version has zeros.
shift_concat_gradient(test_df, depth_col, well_col, periods=1, fill_value=None)

Unnamed: 0,gr_shifted_1,den_shifted_1,gr,den,gr_shifted_-1,den_shifted_-1,well,md,gr_gradient,den_gradient
0,,,100.1,2.1,100.2,2.2,1,500.0,,
1,100.1,2.1,100.2,2.2,100.3,2.3,1,500.5,0.2,0.2
2,100.2,2.2,100.3,2.3,,,1,501.0,0.2,0.2
3,,,20.1,1.7,20.2,1.8,2,1000.0,,
4,20.1,1.7,20.2,1.8,20.3,1.9,2,1000.05,2.0,2.0
5,20.2,1.8,20.3,1.9,,,2,1001.0,0.105263,0.105263
