In [52]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib as mptlib
import keras as ks
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
import os

np.random.seed(42) # set the seed for reproducibility

In [53]:
root = "./"
df = pd.read_csv(root + "/oil_wells_data.csv")

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2018-05-17 22:56:06.000000,27897490.0,19824230.0,125.6859,4059666.0,97.55283,,,,0
1,2018-05-17 22:56:07.000000,27897450.0,19824230.0,125.6859,4059666.0,97.55283,,,,0
2,2018-05-17 22:56:08.000000,27897360.0,19824230.0,125.6859,4059666.0,97.55283,,,,0
3,2018-05-17 22:56:09.000000,27897430.0,19824230.0,125.6859,4059666.0,97.55282,,,,0
4,2018-05-17 22:56:10.000000,27897500.0,19824230.0,125.6859,4059666.0,97.55282,,,,0


In [54]:
#we drop the timestamps and the empty columns
clean_df = df.drop({'timestamp','P-JUS-CKGL','T-JUS-CKGL','QGL'}, axis=1)

#checking for any Nan value
array_sum = np.sum(clean_df)
array_has_nan = np.isnan(clean_df)
for i in array_has_nan:
    if i==True:
        print("True")

In [55]:
x = clean_df.drop({'class'}, axis = 1)
y = clean_df['class']

#we drop the first 25 lines because each timeserie will have the label of the last observation (25th) and our stride is one
y = y.drop({i for i in range(24)}, axis=0)
#resetting the index
y = y.reset_index()
# to drop the index columns
y = y['class']

# we apply a sliding window of size 2 and stride 1 on our dataset to generate multiple instances of timeseries.
timeseries_x = np.lib.stride_tricks.sliding_window_view(x, window_shape=(25,5))
timeseries_x = np.reshape(timeseries_x, (timeseries_x.shape[0], timeseries_x.shape[2]*5 ))

timeseries_x = pd.DataFrame(timeseries_x)

#we stick them back together:
timeseries = pd.concat([timeseries_x, y], axis=1, join='inner')


In [56]:
from sklearn.model_selection import train_test_split

timeseries_train, timeseries_test = train_test_split(timeseries, test_size=0.25, random_state=42)


In [57]:
#standardization
from sklearn.preprocessing import StandardScaler

def standardize_df(data):

    scaler = StandardScaler()
    #transforming the df to a numpy array
    df = data.values
    #back to pandas dataframe
    df_std = pd.DataFrame(scaler.fit_transform(df),
                            columns=data.columns, index=data.index)
    return df_std


# we drop the class labels
x_train = timeseries_train.drop({'class'}, axis=1)
x_test = timeseries_test.drop({'class'}, axis =1)

x_train = standardize_df(x_train)
x_test = standardize_df(x_test)

x_train = x_train.values
x_train = np.reshape(x_train, (x_train.shape[0], 25, 5 ))
x_test = x_test.values
x_test = np.reshape(x_test, (x_test.shape[0], 25, 5 ) )


In [58]:
y_train = timeseries_train['class']
y_train = pd.get_dummies(y_train)
y_test = timeseries_test['class']
y_test = pd.get_dummies(y_test)


In [59]:
# Implement the convolutional layer

conv = tf.keras.layers.Conv1D(filters=8, kernel_size=2, strides=1,
                              padding='same', activation='relu')

#maxpool = tf.keras.layers.MaxPooling1D(1)
avepool = tf.keras.layers.AveragePooling1D(1)

verbose, epochs, batch_size = 0, 10, 32
n_timesteps, n_features, n_outputs = 25, 5, 3
model = Sequential()
model.add(Conv1D(filters=8, kernel_size=1, strides = 1, padding = 'same', activation='relu', input_shape=(n_timesteps,n_features)))

model.add(Dropout(0.5))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(n_outputs, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
# evaluate model
_, accuracy = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)


In [None]:
print(accuracy)


0.9552194476127625
