In [1]:
# This is an example of the creation of the model on synthetic data

In [2]:
# packages used
import os
import pandas as pd
from glob import glob
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

In [3]:
# the functions used in feature extraction
def rms(data):
    """Return the root mean square of ``data``: sqrt(mean(data ** 2))."""
    mean_of_squares = np.mean(np.square(data))
    return np.sqrt(mean_of_squares)


def sra(data):
    """Return the squared mean of the square roots of ``|data|``."""
    root_amplitudes = np.sqrt(np.abs(data))
    return np.square(np.mean(root_amplitudes))


def kv(data):
    """Return the fourth standardized moment of ``data`` (non-excess kurtosis).

    The population standard deviation (``np.std`` default, ddof=0) is used
    for the normalisation.
    """
    centred = data - np.mean(data)
    standardized = centred / np.std(data)
    return np.mean(np.power(standardized, 4))


def sv(data):
    """Return the third standardized moment of ``data`` (skewness).

    The population standard deviation (``np.std`` default, ddof=0) is used
    for the normalisation.
    """
    centred = data - np.mean(data)
    standardized = centred / np.std(data)
    return np.mean(np.power(standardized, 3))


def p2pv(data):
    """Return the peak-to-peak value of ``data``: max(data) - min(data)."""
    highest = np.max(data)
    lowest = np.min(data)
    return highest - lowest


def cf(data):
    """Return the crest factor of ``data``: max(|data|) / RMS(data)."""
    peak = np.max(np.abs(data))
    root_mean_square = np.sqrt(np.mean(np.square(data)))
    return peak / root_mean_square


def imf(data):
    """Return max(|data|) divided by mean(|data|)."""
    magnitudes = np.abs(data)
    peak = np.max(magnitudes)
    return peak / np.mean(magnitudes)


def mf(data):
    """Return max(|data|) divided by the squared mean of sqrt(|data|)."""
    magnitudes = np.abs(data)
    peak = np.max(magnitudes)
    squared_mean_root = np.square(np.mean(np.sqrt(magnitudes)))
    return peak / squared_mean_root


def sf(data):
    """Return the shape factor of ``data``: RMS(data) / mean(|data|).

    The previous body was an exact copy of ``cf`` (peak / RMS), so the two
    features were always identical. The shape factor relates the RMS value
    to the mean absolute value instead.
    """
    root_mean_square = np.sqrt(np.mean(np.square(data)))
    mean_abs = np.mean(np.abs(data))
    return root_mean_square / mean_abs


def kf(data):
    """Return the kurtosis factor: kurtosis / (mean(data ** 2)) ** 2.

    The numerator is the fourth standardized moment, computed exactly as in
    ``kv``. The previous version had a misplaced parenthesis and evaluated
    ``data - mean / std`` instead of ``(data - mean) / std``.
    """
    standardized = (data - np.mean(data)) / np.std(data)
    kurtosis = np.mean(np.power(standardized, 4))
    return kurtosis / np.square(np.mean(np.square(data)))


def dfa(data):
    """Return ``nolds.dfa(data)`` (detrended fluctuation analysis from nolds).

    ``nolds`` is imported here because the notebook's import cell does not
    import it; without this the call raised NameError.
    """
    import nolds

    return nolds.dfa(data)


def npeaks(data, threshold):
    """Return the number of peaks ``detect_peaks`` finds in ``data``.

    ``threshold`` is forwarded unchanged to ``detect_peaks``.
    """
    peak_indices = detect_peaks(data, threshold=threshold)
    return len(peak_indices)


def fc(data):
    """Return the mean FFT magnitude over bins 1 .. len(data)//2 - 1.

    Bin 0 (the DC term) and the upper half of the spectrum are excluded.
    """
    spectrum = np.fft.fft(data)
    half = len(spectrum) // 2
    magnitudes = np.abs(spectrum[1:half])
    return np.mean(magnitudes)


def rmsf(data):
    """Return the square root of the mean FFT magnitude.

    Uses bins 1 .. len(data)//2 - 1, i.e. the DC term and the upper half of
    the spectrum are excluded.
    """
    spectrum = np.fft.fft(data)
    half = len(spectrum) // 2
    magnitudes = np.abs(spectrum[1:half])
    return np.sqrt(np.mean(magnitudes))


def rvf(data):
    """Return the RMS deviation of the FFT magnitudes from ``fc(data)``.

    Uses bins 1 .. len(data)//2 - 1, the same range ``fc`` averages over.
    """
    spectrum = np.fft.fft(data)
    magnitudes = np.abs(spectrum[1:len(spectrum) // 2])
    deviations = magnitudes - fc(data)
    return np.sqrt(np.mean(np.square(deviations)))


def detect_peaks(x, mph=None, mpd=1, threshold=0, edge='rising',
                 kpsh=False, valley=False, ax=None):
    """Return the indices of the peaks (local maxima) found in ``x``.

    Parameters
    ----------
    x : 1-D array-like
        Signal to search; it is copied and converted to float64.
    mph : number or None
        Minimum peak height; peaks with ``x[i] < mph`` are discarded.
    mpd : int
        Minimum distance (in samples) between peaks; only applied when > 1.
        Within that distance the highest peak is kept.
    threshold : number
        When > 0, a peak must exceed both of its immediate neighbours by at
        least this amount.
    edge : {None, 'rising', 'falling', 'both'}
        Which sample of a flat peak is reported: the first ('rising'), the
        last ('falling') or both. A falsy value keeps only strict peaks.
    kpsh : bool
        Keep peaks of equal height even if they are closer than ``mpd``.
    valley : bool
        Detect valleys (local minima) instead, by negating ``x`` and ``mph``.
    ax : object
        Not used by this implementation; kept for interface compatibility.

    Returns
    -------
    numpy.ndarray
        1-D integer array of peak indices in ascending order. Empty when
        ``x`` has fewer than three samples.
    """
    x = np.atleast_1d(x).astype('float64')
    if x.size < 3:
        return np.array([], dtype=int)
    if valley:
        x = -x
        if mph is not None:
            mph = -mph
    # find indices of all peaks
    dx = x[1:] - x[:-1]
    # handle NaN's
    indnan = np.where(np.isnan(x))[0]
    if indnan.size:
        # +inf next to a sample makes the comparisons below fail on the NaN
        # itself and on the sample before it
        x[indnan] = np.inf
        dx[np.where(np.isnan(dx))[0]] = np.inf
    ine, ire, ife = np.array([[], [], []], dtype=int)
    if not edge:
        ine = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) > 0))[0]
    else:
        if edge.lower() in ['rising', 'both']:
            ire = np.where((np.hstack((dx, 0)) <= 0) & (np.hstack((0, dx)) > 0))[0]
        if edge.lower() in ['falling', 'both']:
            ife = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) >= 0))[0]
    ind = np.unique(np.hstack((ine, ire, ife)))
    # handle NaN's
    if ind.size and indnan.size:
        # NaN's and values close to NaN's cannot be peaks
        # (np.isin replaces np.in1d, which is deprecated since NumPy 2.0)
        ind = ind[np.isin(ind, np.unique(np.hstack((indnan, indnan - 1, indnan + 1))), invert=True)]
    # first and last values of x cannot be peaks
    if ind.size and ind[0] == 0:
        ind = ind[1:]
    if ind.size and ind[-1] == x.size - 1:
        ind = ind[:-1]
    # remove peaks < minimum peak height
    if ind.size and mph is not None:
        ind = ind[x[ind] >= mph]
    # remove peaks - neighbors < threshold
    if ind.size and threshold > 0:
        dx = np.min(np.vstack([x[ind] - x[ind - 1], x[ind] - x[ind + 1]]), axis=0)
        ind = np.delete(ind, np.where(dx < threshold)[0])
    # detect small peaks closer than minimum peak distance
    if ind.size and mpd > 1:
        ind = ind[np.argsort(x[ind])][::-1]  # sort ind by peak height
        idel = np.zeros(ind.size, dtype=bool)
        for i in range(ind.size):
            if not idel[i]:
                # keep peaks with the same height if kpsh is True
                idel = idel | (ind >= ind[i] - mpd) & (ind <= ind[i] + mpd) \
                       & (x[ind[i]] > x[ind] if kpsh else True)
                idel[i] = 0  # Keep current peak
        # remove the small peaks and sort back the indices by their occurrence
        ind = np.sort(ind[~idel])
    return ind

In [5]:
x = []  # the list that will contain the samples (one DataFrame per CSV file)
y = []  # the list that will contain the labels (1 or 0, same order as x)
rates = []  # the list that will contain the rates (not filled in this cell)

EXT = "*.csv"

# Samples labelled 1: every CSV found anywhere below the label_1 folder.
# NOTE(review): the file order comes from os.walk/glob and is not sorted;
# verify it matches the row order of any data joined to it by position.
PATH = r'path_to_the_folder\synthetic_data\label_1'
sand_csv_files = [
    csv_path
    for root, _dirs, _names in os.walk(PATH)
    for csv_path in glob(os.path.join(root, EXT))
]

for filename in sand_csv_files:
    df = pd.read_csv(filename)
    x.append(df)
    y.append(1)

# Samples labelled 0: every CSV found anywhere below the label_0 folder.
PATH = r'path_to_the_folder\synthetic_data\label_0'
nosand_csv_files = [
    csv_path
    for root, _dirs, _names in os.walk(PATH)
    for csv_path in glob(os.path.join(root, EXT))
]

for filename in nosand_csv_files:
    df = pd.read_csv(filename)
    x.append(df)
    y.append(0)


# Column layout of the feature table. 'alloc_liq' and 'alloc_gas' are not
# computed in this cell; they are left as NaN here.
feature_columns = ['ac_mean', 'ac_std', 'ac_rms', 'ac_sra', 'ac_p2pv', 'ac_n_peaks', 'ac_fc', 'ac_rmsf', 'ac_rvf',
                   'chk_min', 'chk_std', 'chk_del_sum', 'chk_del_abs_sum', 'alloc_liq', 'alloc_gas']

# NOTE(review): T, N and fs are not used by the feature extraction below;
# they are kept in case other cells reference them.
T = 9  # sample rate
N = 100 # length of signals
fs = np.linspace(0, 1 / T, N) # frequencies for FFT

# extracting features from all samples
# One dict per sample, turned into a DataFrame in a single call. Filling an
# empty frame cell by cell through .loc left every column with object dtype.
feature_rows = []
for df in x:
    ac = df['ac'].values
    chk_delta = df['chk'].diff().fillna(0)  # first difference, leading NaN -> 0
    feature_rows.append({
        'ac_mean': df['ac'].mean(),
        'ac_std': np.std(df['ac']),
        'ac_rms': rms(ac),
        'ac_sra': sra(ac),
        'ac_p2pv': p2pv(ac),
        'ac_n_peaks': npeaks(ac, threshold=0),
        'ac_fc': fc(ac),
        'ac_rmsf': rmsf(ac),
        'ac_rvf': rvf(ac),
        'chk_min': np.min(df['chk'].values),
        'chk_std': np.std(df['chk'].values),
        'chk_del_sum': chk_delta.sum(),
        'chk_del_abs_sum': chk_delta.abs().sum(),
        'alloc_liq': np.nan,
        'alloc_gas': np.nan,
    })

# Rows are indexed 0 .. len(x) - 1, in the same order as x and y.
main_df = pd.DataFrame(feature_rows, columns=feature_columns)

In [26]:
# Per-sample rate tables, one CSV per label.
rates_0 = pd.read_csv(r'path_to_the_folder\synthetic_data/rates_0.csv')
rates_1 = pd.read_csv(r'path_to_the_folder\synthetic_data/rates_1.csv')

oil_0, water_0, gas_0 = (rates_0[column].values for column in ('oil', 'water', 'gas'))
oil_1, water_1, gas_1 = (rates_1[column].values for column in ('oil', 'water', 'gas'))

# Label-1 values come first, then label-0 values; the arrays are assigned to
# main_df by position.
# NOTE(review): this presumes the CSV rows are in the same order as the
# rows of main_df; confirm against how the samples were loaded.
oil = np.concatenate((oil_1, oil_0))
water = np.concatenate((water_1, water_0))
gas = np.concatenate((gas_1, gas_0))

main_df['alloc_liq'] = oil + water
main_df['alloc_gas'] = gas

In [27]:
# Infinite and missing feature values are replaced with 0 before training.
main_df = main_df.replace([np.inf, -np.inf], 0)
main_df = main_df.fillna(0)
X_train = main_df

# Labels as an integer array. y_train was previously only assigned in a
# commented-out line, so the fit call failed with NameError on a fresh kernel.
y = np.array(y, dtype='int')
y_train = y

In [29]:
# Random forest with 100 trees, all CPU cores (n_jobs=-1) and balanced class
# weights. No random_state is set, so the fitted model differs between runs.
rfc = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    class_weight="balanced",
)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [31]:
# Save the fitted classifier as 'RFC_v3.sav' in the target directory.
# Note that os.chdir changes the working directory for the rest of the session.
os.chdir(r'path_to_a_directory')
filename = 'RFC_v3.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# the previous bare open() call never closed its handle.
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file, protocol=2)