In [1]:
import os
import tempfile

#numpy
import numpy as np
# pandas
from pandas import read_csv
# matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
#teras
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import AUC, Accuracy, Precision, Recall
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint


# sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
seed = 11
np.random.seed(seed)


In [3]:
# load dataset
dataset = input('\nWhich is the name of the dataset? ')
dataframe = read_csv(dataset, header=0)
dataset = dataframe.values
#dataset = np.random.shuffle(dataset)
first_col = input("\nSelect first column: \n") #2
last_col = input('\nSelect last col: \n')          #15

X = dataset[:,int(first_col):int(last_col)].astype(float)   # columns from 3rd to 14th into X
Y = dataset[:,int(last_col)] #label column (15th) into Y 


Which is the name of the dataset? analysis.csv

Select first column: 
2

Select last col: 
15


In [4]:
#Encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

#One-hot encoding
transformed_Y = to_categorical(encoded_Y)

In [5]:
bkg, sgn = np.bincount(encoded_Y)

total =  bkg + sgn
print('Samples:\n Total: {}\n Background: {} \n Signal: {} \n Signal samples are {:.2f}% of the total'.format(total, bkg, sgn, 100*sgn/total))

Samples:
 Total: 3800033
 Background: 3713773 
 Signal: 86260 
 Signal samples are 2.27% of the total


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    transformed_Y,
                                                    test_size=0.25,
                                                    random_state=seed,
                                                    shuffle = True)

scaler = StandardScaler()
X_train =scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [7]:
print('Training labels shape:', Y_train.shape)
print('Validation labels shape:', Y_test.shape)


print('Training features shape:', X_train.shape)
print('Validation features shape:', X_test.shape)

Training labels shape: (2850024, 2)
Validation labels shape: (950009, 2)
Training features shape: (2850024, 13)
Validation features shape: (950009, 13)


In [8]:
METRICS = [
    AUC(name = 'AUC'),
    Accuracy(name = 'accuracy'),
    Precision(name = 'precision'),
    Recall(name = 'recall')    
]


def make_model(metrics = METRICS, output_bias = None):   
    
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)    
    #Initialising NN
    model = Sequential()

    #First layer
    model.add(Dense(8, activation='relu', input_shape=(13,)))
    #model.add(BatchNormalization())
    model.add(Dropout(0.5))

    #Second layer
    model.add(Dense(12, activation='relu'))
   # model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(2, activation='sigmoid'))


    model.compile(loss='binary_crossentropy', 
                  optimizer=Adam(lr= 0.0001),
                  metrics=metrics
                 )
    return model

In [40]:
EPOCHS = 1000
BATCH_SIZE = 150000
val_data = (X_test,Y_test)
#checkpoint = ModelCheckpoint("model_weights.h5", monitor='val_precision', verbose=1, save_best_only=True, mode='max')
#callbacks_list = [checkpoint]
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_precision',
    verbose = 1,
    patience = 70,
    mode = 'max',
    restore_best_weights = True)



In [23]:
model = make_model()
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 8)                 112       
_________________________________________________________________
dropout_8 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_13 (Dense)             (None, 12)                108       
_________________________________________________________________
dropout_9 (Dropout)          (None, 12)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 26        
Total params: 246
Trainable params: 246
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.predict(X_train[:10])

array([[0.5548207 , 0.5408166 ],
       [0.5718665 , 0.51383585],
       [0.55785865, 0.47869998],
       [0.5706248 , 0.5012827 ],
       [0.559022  , 0.48646492],
       [0.55066365, 0.5225628 ],
       [0.54891723, 0.5088631 ],
       [0.5472307 , 0.49558932],
       [0.5755338 , 0.52413446],
       [0.60277104, 0.516567  ]], dtype=float32)

In [18]:
result = model.evaluate(X_test, Y_test, batch_size = BATCH_SIZE, verbose = 0)
print("Loss: {:0.4f}".format(result[0]))

Loss: 1.0483


In [24]:
initial_bias = np.log([sgn/bkg])
initial_bias

array([-3.76243763])

In [17]:
model = make_model(output_bias = initial_bias)
model.predict(X_train[:10])

array([[0.45471734, 0.78484946],
       [0.4675188 , 0.56701595],
       [0.5611703 , 0.6682617 ],
       [0.48144564, 0.6046998 ],
       [0.56791794, 0.6453167 ],
       [0.48240045, 0.6196639 ],
       [0.3844213 , 0.5881004 ],
       [0.4983701 , 0.6339977 ],
       [0.42116925, 0.54520005],
       [0.45814532, 0.56384915]], dtype=float32)

In [25]:
initial_weights = os.path.join(tempfile.mkdtemp(),'initial_weights')
model.save_weights(initial_weights)

In [26]:
model = make_model()
model.load_weights(initial_weights)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7816961fd0>

In [27]:
weight_for_0 = (1 / bkg)*(total)/2.0
weight_for_1 = (1 / sgn)*(total)/2.0


class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.51
Weight for class 1: 22.03


In [None]:
%%time
history=model.fit(X_train,
                  Y_train,
                  epochs = EPOCHS,
                  shuffle = True,
                  validation_data=val_data,
                  callbacks = [early_stopping],
                  batch_size = BATCH_SIZE,
                  class_weight=class_weight
                 )

Train on 2850024 samples, validate on 950009 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000


Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000


Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000

In [None]:
#model_json = model.to_json()
#with open("model.json", "w") as json_file:
    #json_file.write(model_json)

In [None]:
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
def plot_metrics(history):
    metrics =  ['loss', 'AUC', 'precision', 'recall']
    for n, metric in enumerate(metrics):
        name = metric.replace("_"," ").capitalize()
        plt.subplot(2,2,n+1)
        plt.plot(history.epoch,  history.history[metric], color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_'+metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8,1])
        else:
            plt.ylim([0,1])

        plt.legend()

In [None]:
plot_metrics(history)

In [None]:

#compute predictions
predictions = model.predict(X_test)
print(predictions)
y_pred = np.array([np.argmax(probas) for probas in predictions])
y_test = np.array([np.argmax(label) for label in Y_test])




In [None]:

#confusion matrix
cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
cm


In [None]:

classes = unique_labels(y_test, y_pred)
class_names = unique_labels(Y)

In [38]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          class_names = None,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    if class_names is None:
        x_labels = y_labels = classes
    else:
        x_labels = y_labels = class_names
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=x_labels, yticklabels=y_labels,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
plot_confusion_matrix(y_test, y_pred, classes, class_names)

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.clf()  #Clears the figure
prec = history.history['precision']
val_prec = history.history['val_precision']
plt.plot(epochs, prec, 'bo', label='Training precision')
plt.plot(epochs, val_prec, 'r', label='Validation precision')
plt.title('Training and validation precision')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.clf()  #Clears the figure
rec = history.history['recall']
val_rec = history.history['val_recall']
plt.plot(epochs, rec, 'bo', label='Training recall')
plt.plot(epochs, val_rec, 'r', label='Validation recall')
plt.title('Training and validation recall')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()