In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('data/task1/dataset.csv', encoding='cp1251')

In [3]:
stops = pd.read_csv('data/task1/stops.csv')

In [4]:
pd.set_option('display.max_colwidth', 200)

In [5]:
df.date = pd.to_datetime(df.date, infer_datetime_format=True)

In [6]:
stops.date = pd.to_datetime(stops.date, dayfirst=True)

In [7]:
df.iloc[:,1:20] = (df.iloc[:,1:20] - df.iloc[:,1:20].mean()) / (df.iloc[:,1:20].max() - df.iloc[:,1:20].min())

In [8]:
df.iloc[:,20:] = (df.iloc[:,20:] - df.iloc[:,20:].mean()) / (df.iloc[:,20:].max() - df.iloc[:,20:].min())

In [9]:
df = df.merge(stops, left_on='date', right_on='date', how='outer')

In [10]:
df['type'].replace(['vibrosito', 'stop'], 1, inplace=True)
df['type'].fillna(method='backfill', limit=360, inplace=True)
df.fillna(0, inplace=True)

In [11]:
df = df.rename(index=str, columns={"type": "target"})

In [12]:
n = df[df.date > pd.Timestamp('2018-01-01 00:00:00')].count()[0]
df.drop(df.index[-n:], inplace=True)

In [13]:
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [14]:
def f1_score(true, pred):
    p = precision(true, pred)
    r = recall(true, pred)
    return 2 * (p * r) / (p + r + 1e-6)

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

In [32]:
def get_model(input_shape):
    
    X_input = Input(shape = input_shape)
    
    X = Conv1D(196, kernel_size=20, strides=10)(X_input)                           
    X = BatchNormalization()(X)                                
    X = Activation('tanh')(X)                                 
    X = Dropout(0.8)(X)                                 

  #  X = GRU(units = 128, return_sequences = True)(X) 
 #   X = Dropout(0.8)(X)                              
#    X = BatchNormalization()(X)                      
    
    X = LSTM(units = 128, return_sequences = True)(X) 
    X = Dropout(0.8)(X)                              
    X = BatchNormalization()(X)                      
    X = Dropout(0.8)(X)                              
    
    X = TimeDistributed(Dense(1, activation = "sigmoid"))(X) 

    model = Model(inputs = X_input, outputs = X)
    
    return model

In [16]:
sample = df.count()[0]
seq_len = 1000

In [17]:
X_0_train = []
while len(X_0_train) < 500:
    n = random.randint(seq_len, sample*3//4)
    X_0_train.append(df.iloc[n-seq_len:n, 1:].values)

In [18]:
indices_with_ones = df[df['target'] == 1].index.tolist()
indices_with_ones_train = indices_with_ones[:len(indices_with_ones)*3//4]
X_1_train = []
while len(X_1_train) < 500:
    i = int(random.choice(indices_with_ones))
    corr = random.randint(0, seq_len//2)
    X_1_train.append(df.iloc[i+corr-seq_len:i+corr, 1:].values)

In [19]:
X_0_test = []
while len(X_0_test) < 200:
    n = random.randint(sample*3//4, sample)
    X_0_test.append(df.iloc[n-seq_len:n, 1:].values)

In [20]:
indices_with_ones_test = indices_with_ones[len(indices_with_ones)*3//4:]
X_1_test = []
while len(X_1_test) < 200:
    i = int(random.choice(indices_with_ones))
    corr = random.randint(0, seq_len//2)
    X_1_test.append(df.iloc[i+corr-seq_len:i+corr, 1:].values)

In [21]:
X_train = X_0_train + X_1_train
X_test = X_0_test + X_1_test
X_train = np.array(X_train)
X_test = np.array(X_test)
np.random.shuffle(X_train)
np.random.shuffle(X_test)

In [22]:
X = X_train[:,:,:-1]
y = X_train[:,:-10:10,-1:].astype(int)
print(X.shape, y.shape)

(1000, 1000, 43) (1000, 99, 1)


In [33]:
model = get_model(input_shape = (seq_len, 43))

In [34]:
opt = Adam(lr=0.1, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy", f1_score])

In [25]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000, 43)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 99, 196)           168756    
_________________________________________________________________
batch_normalization_1 (Batch (None, 99, 196)           784       
_________________________________________________________________
activation_1 (Activation)    (None, 99, 196)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 99, 196)           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 99, 128)           124800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 99, 128)           0         
__________

In [26]:
X_val = X_test[:,:,:-1]
y_val = X_test[:,:-10:10,-1:]
print(X_val.shape, y_val.shape)

(400, 1000, 43) (400, 99, 1)


In [35]:
model.fit(X, y, batch_size = 10, epochs=50, validation_data = (X_val, y_val))

Train on 1000 samples, validate on 400 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f29e01895c0>

In [36]:
y_pred = model.predict(X_val)
y_pred.shape

(400, 99, 1)

In [37]:
y_val = y_val.reshape(400*99).astype(int).tolist()

In [50]:
y_pred = (model.predict(X_val).reshape(400*99) > 0.05)

In [51]:
from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(y_pred, y_val))
print(f1_score(y_val, y_pred))

[[14488    44]
 [18466  6602]]
0.41634609320804694


In [None]:
sum(sum(y_val))