In [276]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib as mptlib
import keras as ks
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

np.random.seed(42) # set the seed for reproducibility


root = "./"
df = pd.read_csv(root + "oil_wells_data.csv")
df.head()

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2018-05-17 22:56:06.000000,27897490.0,19824230.0,125.6859,4059666.0,97.55283,,,,0
1,2018-05-17 22:56:07.000000,27897450.0,19824230.0,125.6859,4059666.0,97.55283,,,,0
2,2018-05-17 22:56:08.000000,27897360.0,19824230.0,125.6859,4059666.0,97.55283,,,,0
3,2018-05-17 22:56:09.000000,27897430.0,19824230.0,125.6859,4059666.0,97.55282,,,,0
4,2018-05-17 22:56:10.000000,27897500.0,19824230.0,125.6859,4059666.0,97.55282,,,,0


In [277]:
#we drop the timestamps and class labels, aswell as the empty columns
clean_df = df.drop({'timestamp','P-JUS-CKGL','T-JUS-CKGL','QGL'}, axis=1)

x = clean_df.drop({'class'}, axis=1)
x.head()

Unnamed: 0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP
0,27897490.0,19824230.0,125.6859,4059666.0,97.55283
1,27897450.0,19824230.0,125.6859,4059666.0,97.55283
2,27897360.0,19824230.0,125.6859,4059666.0,97.55283
3,27897430.0,19824230.0,125.6859,4059666.0,97.55282
4,27897500.0,19824230.0,125.6859,4059666.0,97.55282


In [278]:
#checking for any Nan value
array_sum = np.sum(x)
array_has_nan = np.isnan(x)
for i in array_has_nan:
    if i==True:
        print("True")

In [279]:
#we retrieve the class labels
y = df['class']
#Since we'll be using a window of size 2, the label of each timeserie will be the second one, so we drop the first class label.
y = y.drop(0, axis=0)

# One hot
y = pd.get_dummies(y)
y.head()


Unnamed: 0,0,8,108
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0


In [280]:
#standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_std = x.values
X_train_std = pd.DataFrame(scaler.fit_transform(x_train_std),
                          columns=x.columns, index=x.index)
X_train_std.head()

Unnamed: 0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP
0,-2.678032,-2.661793,2.493874,2.449044,1.907898
1,-2.678053,-2.661793,2.493874,2.449044,1.907898
2,-2.678101,-2.661793,2.493874,2.449044,1.907898
3,-2.678064,-2.661793,2.493874,2.449044,1.907898
4,-2.678027,-2.661793,2.493874,2.449044,1.907898


In [281]:
y_np = pd.DataFrame(y).to_numpy()

# we apply a sliding window of size 2 and stride 1 on our dataset to generate multiple instances of timeseries.
x_train_timeseries = np.lib.stride_tricks.sliding_window_view(x, window_shape=(2,5))
x_train_timeseries = np.reshape(x_train_timeseries, (x_train_timeseries.shape[0], x_train_timeseries.shape[2], x_train_timeseries.shape[3]))

#Here is the first instance of our timeseries:
print("Our timeseries: %s" % x_train_timeseries[0])
#And its corresponding label:
print("Class label = %s" % y_np[0])

Our timeseries: [[2.789749e+07 1.982423e+07 1.256859e+02 4.059666e+06 9.755283e+01]
 [2.789745e+07 1.982423e+07 1.256859e+02 4.059666e+06 9.755283e+01]]
Class label = [1 0 0]


In [282]:
from sklearn.model_selection import train_test_split

x_train_set, x_test_set = train_test_split(x_train_timeseries, test_size=0.25, random_state=None, shuffle=False, stratify=None)
y_train_set, y_test_set = train_test_split(y_np, test_size=0.25, random_state=None, shuffle=False, stratify=None)

#print(clean_df.corr)