### Customized Data Generator and Iterator for the entire Dataset 

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import importlib 
import csv
from sklearn import metrics
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras import layers
from tensorflow.keras.layers import Conv1D,Conv2D,MaxPooling1D,MaxPooling2D,Flatten,Dense,Dropout,BatchNormalization, GRU, LSTM, RNN
from tensorflow.keras import regularizers as reg

In [2]:
# Folders holding one CSV per segment for the train and test splits.
TRAINDIR = "E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/train"
TESTDIR = "E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/test"

# train.csv is the per-segment table; its `segment_id` column is shown below.
volcano = pd.read_csv('E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/train.csv')
print(volcano.segment_id.to_numpy())

[1136037770 1969647810 1895879680 ...  694853998 1886987043 1100632800]


In [3]:
import glob
import re            

# Every per-segment CSV in the training folder. glob returns paths in
# arbitrary order, so sort them to make the iteration order reproducible
# from run to run. TRAINDIR is the same folder the pattern used to spell out.
files = sorted(glob.glob(f"{TRAINDIR}/*"))
print("Total number of files: ", len(files))

Total number of files:  4431


In [4]:
# Column-less frame whose index is the segment id from train.csv.
train_set = pd.DataFrame({'segment_id': volcano.segment_id}).set_index('segment_id')

# Peek at the raw sensor readings. The loop stops once the counter exceeds 1,
# i.e. after the second file has been read, so `signals` ends up holding the
# second segment listed in train.csv. inf/NaN readings are replaced by 0.
j = 0
for j, seg in enumerate(volcano.segment_id, start=1):
    signals = (
        pd.read_csv(f'E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/train/{seg}.csv')
        .replace([np.inf, -np.inf, np.nan], 0)
    )
    if j > 1:
        break
print(signals)

       sensor_1  sensor_2  sensor_3  sensor_4  sensor_5  sensor_6  sensor_7  \
0          33.0     964.0    -226.0     536.0     143.0     344.0     -18.0   
1          47.0     302.0    -257.0     494.0     171.0     358.0     -32.0   
2          79.0    -431.0    -341.0     478.0     108.0     179.0     -21.0   
3          98.0   -1069.0    -353.0     491.0     -62.0     123.0     -17.0   
4         133.0   -1569.0    -397.0     499.0    -313.0     133.0     -26.0   
...         ...       ...       ...       ...       ...       ...       ...   
59996     415.0   -1196.0    -161.0     -53.0     -30.0     344.0     181.0   
59997     476.0   -1200.0     -88.0     -35.0     125.0     302.0     187.0   
59998     530.0   -1261.0    -140.0     -19.0     277.0     354.0     204.0   
59999     579.0   -1325.0     -14.0      16.0     407.0     329.0     227.0   
60000     601.0   -1336.0     -20.0      55.0     484.0     292.0     244.0   

       sensor_8  sensor_9  sensor_10  
0          8

In [5]:
# True if any cell of `signals` is still missing after the NaN -> 0 replacement.
signals.isna().to_numpy().any()

False

In [6]:
from sklearn import preprocessing

# Rescale each sensor column of the sample segment with a min-max scaler
# fitted on that same segment, then wrap the resulting array in a DataFrame
# (column labels become positional integers).
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit(signals).transform(signals)
signals_normalized = pd.DataFrame(data=x_scaled)

In [7]:
# Show the rescaled readings. Columns are labelled 0-9 rather than sensor_N
# because the frame was built from the scaler's bare array output.
print(signals_normalized)

              0         1         2         3         4         5         6  \
0      0.485007  0.557795  0.489390  0.660797  0.579398  0.567749  0.589384   
1      0.488339  0.487204  0.481268  0.650526  0.586663  0.571669  0.587047   
2      0.495954  0.409042  0.459261  0.646613  0.570317  0.521557  0.588883   
3      0.500476  0.341011  0.456117  0.649792  0.526207  0.505879  0.589551   
4      0.508805  0.287695  0.444590  0.651749  0.461079  0.508679  0.588049   
...         ...       ...       ...       ...       ...       ...       ...   
59996  0.575916  0.327469  0.506419  0.516752  0.534510  0.567749  0.622601   
59997  0.590433  0.327042  0.525544  0.521154  0.574728  0.555991  0.623602   
59998  0.603284  0.320537  0.511920  0.525067  0.614167  0.570549  0.626440   
59999  0.614945  0.313713  0.544931  0.533627  0.647898  0.563550  0.630279   
60000  0.620181  0.312540  0.543359  0.543165  0.667878  0.553191  0.633116   

              7         8         9  
0      0.5635

In [8]:
# Spot-check one specific training segment: load it, replace inf/NaN readings
# with 0, and confirm that no missing values remain.
file_check = f"E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/train/13787554.csv"
nb = pd.read_csv(file_check)
nb = nb.replace([np.inf, -np.inf, np.nan], 0)
nb.isna().to_numpy().any()

False

In [9]:
def data_normalizing(signals):
    """Min-max scale every column of ``signals`` independently.

    A fresh ``MinMaxScaler`` is fitted on the data passed in, so the scaling
    bounds come from this input alone (nothing is shared between calls).

    Parameters
    ----------
    signals : 2-D table of numeric readings (a DataFrame in this notebook).

    Returns
    -------
    pandas.DataFrame
        The scaled values, with default positional column labels.
    """
    scaler = preprocessing.MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(signals))

def ip_data(filename):
    """Load one segment CSV and return its min-max normalised readings.

    Parameters
    ----------
    filename : str or bytes
        Path of the CSV file; bytes paths are decoded by ``sanitize`` first.

    Returns
    -------
    pandas.DataFrame
        Output of ``data_normalizing`` for the file, after every inf, -inf
        and NaN reading has been replaced by 0.
    """
    path = sanitize(filename)
    raw = pd.read_csv(path)
    cleaned = raw.replace([np.inf, -np.inf, np.nan], 0)
    return data_normalizing(cleaned)

def sanitize(string):
    """Return ``string`` decoded to text when that is possible.

    Values that expose ``.decode()`` (e.g. ``bytes``) are decoded with the
    default codec. Anything without a ``decode`` method, or whose bytes cannot
    be decoded, is handed back unchanged.
    """
    try:
        decoded = string.decode()
    except (UnicodeDecodeError, AttributeError):
        # Already text (no .decode) or undecodable bytes: leave the value alone.
        return string
    return decoded

def get_file_name(file_path):
    """Return the file name of ``file_path`` up to its first dot.

    Works for both ``/`` and ``\\`` separators, any directory depth, and a
    bare file name. Splitting on a single backslash and taking element 1
    only handled paths containing exactly one backslash.

    Parameters
    ----------
    file_path : str or bytes
        Path to a file such as ``.../train/1136037770.csv``. ``bytes`` input
        is decoded with the default codec first.

    Returns
    -------
    str
        The final path component truncated at its first ``.``
        (``"1136037770"`` for the example above).
    """
    if isinstance(file_path, (bytes, bytearray)):
        file_path = file_path.decode()
    # Normalise Windows separators so that one split handles both styles.
    base_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    return base_name.split(".")[0]

# Lookup table built from train.csv: segment id -> time_to_eruption.
X_Id = volcano.segment_id
X_Erupt = volcano.time_to_eruption
X_dict = {segment: target for segment, target in zip(X_Id, X_Erupt)}

# Column-less frame indexed by segment id.
training = volcano[['segment_id']].set_index('segment_id')

def iterator_train(files, batch_size):
    """Yield one ``(data, labels)`` training pair per file, each file once.

    Every file in ``files`` is visited exactly once, in order. Advancing a
    ``batch_size``-wide window by a single position would re-yield most
    files ``batch_size`` times and index past the end of ``files`` on the
    last windows, so the list is simply walked front to back.

    Parameters
    ----------
    files : sequence of str or bytes
        Paths of the per-segment CSV files. The file name (without
        extension) must be a segment id present in ``X_dict``.
    batch_size : int
        Accepted so existing calls keep working. Each yielded item is already
        a batch holding a single segment, so this value does not change what
        is produced.

    Yields
    ------
    data : numpy.ndarray, shape (1, 60001, 10, 1)
        Normalised readings of one segment, as returned by ``ip_data``.
    labels : numpy.ndarray, shape (1,)
        The ``X_dict`` entry for that segment.

    Raises
    ------
    KeyError
        If a file's segment id is missing from ``X_dict``.
    ValueError
        If a file does not hold exactly 60001 x 10 values.
    """
    for current_file in files:
        segment_id = int(get_file_name(current_file))
        labels = np.asarray(X_dict[segment_id]).reshape(-1)

        # Add a leading batch axis and a trailing channel axis for Conv2D.
        data = np.asarray(ip_data(current_file)).reshape(-1, 60001, 10, 1)

        yield data, labels

# iterator_train is a generator function: calling it without iterating reads
# no files, so it is not invoked here. It is consumed by the tf.data pipeline
# in the training cell. This cell only reports how many files were found.
print(len(files))

4431


In [10]:
BATCH_SIZE = 5

# Two Conv2D -> BatchNorm -> MaxPool stages over the (60001, 10, 1) segment
# "image", flattened into a single linear unit that regresses the target.
cnn_model = Sequential()
cnn_model.add(Conv2D(64, 3, input_shape = (60001,10,1), activation = 'relu',
                     kernel_initializer='normal',kernel_regularizer=reg.l2(0.05)))
cnn_model.add(BatchNormalization())
cnn_model.add(MaxPooling2D(pool_size = 2))
cnn_model.add(Conv2D(128, 3, activation = 'relu',kernel_initializer='normal',kernel_regularizer=reg.l2(0.05)))
cnn_model.add(BatchNormalization())
cnn_model.add(MaxPooling2D(pool_size = 2))
cnn_model.add(Flatten())
cnn_model.add(Dense(1, activation = 'linear'))
cnn_model.compile(optimizer = 'adam', loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])

# cnn_model.summary()

# Each epoch draws half as many batches as there are training files.
# The built-in int() replaces np.int, an alias that NumPy 1.24 removed.
steps_per_epoch = int(np.ceil(len(files)/2))
print("steps_per_epoch = ", steps_per_epoch)

# fit() pulls steps_per_epoch * epochs batches from the dataset without
# resetting it between epochs; repeat() restarts the generator whenever it
# is exhausted so later epochs cannot run out of data.
train_dataset = tf.data.Dataset.from_generator(iterator_train, args = [files, BATCH_SIZE],
                                               output_shapes = ((None,60001,10,1),(None,)),
                                              output_types = (tf.float32, tf.float32)).repeat()
cnn_model.fit(train_dataset, steps_per_epoch = steps_per_epoch, epochs=5)

steps_per_epoch =  2216
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1dbc3248b20>

In [11]:
# Sample submission file for the test split. `test_sample` is not referenced
# again in the cells shown here; the same file is re-read as `test_set` when
# the submission is written.
test_sample = pd.read_csv('E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/sample_submission.csv')

def data_normalizing(signals):
    """Min-max scale every column of ``signals`` independently.

    The scaler is created inside this function, so it has to be fitted on the
    data it is given: calling ``transform`` on a never-fitted ``MinMaxScaler``
    raises ``NotFittedError``. Fitting per segment also matches how the
    training segments are normalised, so train and test inputs are on the
    same scale.

    Parameters
    ----------
    signals : 2-D table of numeric readings (a DataFrame in this notebook).

    Returns
    -------
    pandas.DataFrame
        The scaled values, with default positional column labels.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(signals)
    signals_normalized = pd.DataFrame(x_scaled)
    return signals_normalized

def ip_data(filename):
    """Read a segment CSV, zero out bad readings and normalise it.

    ``filename`` may be ``str`` or ``bytes`` (it is passed through
    ``sanitize``). Every inf, -inf and NaN value is replaced by 0 before the
    table is handed to ``data_normalizing``, whose DataFrame result is
    returned.
    """
    bad_values = [np.inf, -np.inf, np.nan]
    table = pd.read_csv(sanitize(filename)).replace(bad_values, 0)
    return data_normalizing(table)

def sanitize(string):
    """Decode ``string`` to text if it can be decoded; otherwise return it as is.

    Objects without a ``decode`` method and byte strings that are not valid
    in the default codec are returned untouched.
    """
    try:
        return string.decode()
    except (UnicodeDecodeError, AttributeError):
        # Not bytes-like, or not decodable: keep the original value.
        return string

def iterator_test(files, batch_size):
    """Yield the model input for each test file, one file per step, in order.

    Item ``k`` of the generator comes from ``files[k]``, so predictions made
    from it line up one-to-one with ``files``. Sliding a ``batch_size``-wide
    window one position at a time would instead revisit files, break that
    correspondence, and index past the end of ``files`` on the last windows.

    Parameters
    ----------
    files : sequence of str or bytes
        Paths of the per-segment CSV files to predict on.
    batch_size : int
        Accepted so existing calls keep working. Each yielded item is already
        a batch holding a single segment, so this value does not change what
        is produced.

    Yields
    ------
    numpy.ndarray, shape (1, 60001, 10, 1)
        Normalised readings of one segment, as returned by ``ip_data``.

    Raises
    ------
    ValueError
        If a file does not hold exactly 60001 x 10 values.
    """
    for current_file in files:
        # Add a leading batch axis and a trailing channel axis for Conv2D.
        data = np.asarray(ip_data(current_file)).reshape(-1, 60001, 10, 1)
        yield data

In [12]:
files_test = glob.glob(f"E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/test/*")

BATCH_SIZE = 5
# One predict step per test file. len() already returns a plain int, so the
# np.int(np.ceil(...)) wrapper (np.int was removed in NumPy 1.24) is not needed.
steps_per_epocht = len(files_test)

prediction_dataset = tf.data.Dataset.from_generator(iterator_test, args = [files_test, BATCH_SIZE],
                               output_shapes=(None,60001,10,1), output_types=(tf.float32))

predictions = cnn_model.predict(prediction_dataset, steps = steps_per_epocht)
print(len(predictions))

4520


In [13]:
# Raw model output: a 2-D array with one single-value row per prediction step.
print(predictions)

[[1.0367773e+08]
 [1.0049795e+08]
 [1.8562946e+07]
 ...
 [2.4311878e+07]
 [1.3868736e+07]
 [1.1725846e+07]]


In [16]:
test_set = pd.read_csv(f'E:/code/ml/datasets/predict-volcanic-eruptions-ingv-oe/sample_submission.csv')

# `predictions` is produced by iterating `files_test`, whose order comes from
# glob and need not match the row order of sample_submission.csv. Key every
# prediction by the segment id taken from its file name and look the ids up
# in submission order, instead of pairing the two tables by position.
predicted_ids = [
    int(os.path.splitext(os.path.basename(path))[0]) for path in files_test
]
flat_predictions = np.asarray(predictions).reshape(-1)
if len(flat_predictions) != len(predicted_ids):
    raise ValueError(
        f"got {len(flat_predictions)} predictions for {len(predicted_ids)} test files"
    )
prediction_by_id = pd.Series(flat_predictions, index=predicted_ids)

submision = pd.DataFrame()
submision['segment_id'] = test_set.segment_id
submision['time_to_eruption'] = prediction_by_id.reindex(test_set.segment_id).to_numpy()
# A NaN at this point means a submission id had no matching test file.
if submision['time_to_eruption'].isna().any():
    raise ValueError("some segment ids in sample_submission.csv have no prediction")
submision.to_csv('submit_CNN_Bby2.csv', header=True, index=False)