In [1]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sn
import os
os.environ['PYTHONHASHSEED']=str(42)
%matplotlib inline
import uuid
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, top_k_accuracy_score, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time
import random

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

def reset_random_seeds():
   os.environ['PYTHONHASHSEED']=str(42)
   tf.random.set_seed(42)
   np.random.seed(42)
   random.seed(42)

2022-04-17 14:39:50.490432: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-17 14:39:50.490575: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
DATASET_PARENT_FOLDER = "../Datasets/Higgs/chunked_higgs/"

In [3]:
all_sets_id = [1,2,3,4,5,6,7,8,9,10,11]
train_set_id = [2,3,4,5,6,7,8,9,10,11] # Includes the valid set already
test_set_id = [1]

In [4]:
# Description of train sets

# for i in train_set_id:
#     higgs_train = pd.read_csv(os.path.join(DATASET_PARENT_FOLDER, "output_{}.csv".format(i)))
#     description = higgs_train.describe()
#     print("---------------output_{}.csv---------------".format(i))
#     print(description)

In [5]:
# Number of Higgs and Non-Higgs train sets

# for i in all_sets_id:
#     higgs_train = pd.read_csv(os.path.join(DATASET_PARENT_FOLDER, "output_{}.csv".format(i)))
#     result = higgs_train['1.000000000000000000e+00'].value_counts()
#     print("---------------output_{}.csv---------------".format(i))
#     print(result)

In [6]:
# !pip install tensorflow

In [7]:
# Model

# 10-L Deep-NN
def deepNN_model_creator(input_shape):
    model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_shape),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
        
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )
    return model

def shallowNN_model_creator(input_shape):
    model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_shape),
    layers.Dense(256, activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )
    return model
    

In [15]:
# for shallow NN, use name 'shallowNN'
# for deep NN, use name 'deepNN'

reset_random_seeds()
def model_trainer(model_to_fit, X_train, y_train, epochs, batch_size, validation_data, model_custom_name):
    
    """
        model_to_fit: model to be trained
        epochs: no. of epochs
        batch_size: no. of batch size
        validation_data: (X_valid, y_valid)
        model_custom_name: create your own name
    """
    
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath="./{}".format(model_custom_name), 
        monitor="val_loss",
        verbose=1, 
        save_best_only=True,
        mode="min"
    )
    
    model_to_fit = model_to_fit
    history = model_to_fit.fit(
        X_train=X_train, y_train=y_train,
        validation_data=validation_data,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[checkpoint],
    )
    
    history_df = pd.DataFrame(history.history)
    history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
    history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [16]:
# Data Splitting and Training
from tensorflow.keras.models import load_model

reset_random_seeds()
def train_iterator(model_custom_name):
    for i in train_set_id:
        print("----------Dataset: output_{}.csv----------".format(i))
        higgs_train = pd.read_csv(os.path.join(DATASET_PARENT_FOLDER, "output_{}.csv".format(i)))
        X = higgs_train.copy()
        y = X.pop("1.000000000000000000e+00")

        scaler_folder = "./scaler"
        if not os.path.exists(scaler_folder):
            os.mkdir(scaler_folder)
            
        scaler_filename = "higgs_scaler.sav"
        scaler_path = os.path.join(scaler_folder,scaler_filename)
        if os.path.exists(scaler_path):
            os.remove(scaler_path) # Delete to replace the existing
       
        scaler = MinMaxScaler()
        scaler.fit(X)
        joblib.dump(scaler,scaler_path)
        
        scaler = joblib.load(open(scaler_path, "rb"))
        X = scaler.transform(X)
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=100000/1000000, random_state=42)
        validation_data = (X_valid, y_valid)
        input_shape = [X_train.shape[1]]
        print(input_shape)

        if os.path.exists("./{}".format(model_custom_name)):
            model_to_fit = load_model("./{}".format(model_custom_name))
            print("model loaded")
        else:
            model_to_fit = deepNN_model_creator(input_shape)
            print("model initialized")
        model_trainer(model_to_fit=model_to_fit, X_train=X_train, y_train=y_train,epochs=100,batch_size=512, validation_data=validation_data, model_custom_name=model_custom_name)

In [17]:
reset_random_seeds()
train_iterator(model_custom_name="deepNN_model")

----------Dataset: output_2.csv----------
[28]
model initialized


TypeError: fit() got an unexpected keyword argument 'X_train'

In [None]:
reset_random_seeds()
def predictor(model_path, test_path, scaler_path):
    higgs_test = pd.read_csv(test_path)
    X_test = higgs_test.copy()
    y_test = X_test.pop("1.000000000000000000e+00")
    
    scaler = joblib.load(open(scaler_path, "rb"))
    X_test = scaler.transform(X_test)
    
    model = load_model(model_path)
    
    prediction_start = time.perf_counter()
    predictions = model.predict(X_test)
    prediction_end = time.perf_counter()
    print("Prediction Time :", prediction_end - prediction_start, " seconds")
    
    predictions = [0 if x > 0.5 else 1 for x in predictions]
    
    f1_score_model = f1_score(predictions, y_test, average="binary")
    accuracy_score_model = accuracy_score(predictions, y_test)
    precision_score_model = precision_score(predictions, y_test)
    recall_score_model = recall_score(predictions, y_test)
    roc_auc_score_model = roc_auc_score(predictions, y_test)
    roc_curve_model = roc_curve(predictions, y_test)
    top_k_accuracy_score_model = top_k_accuracy_score(predictions, y_test)
    classification_report_model = classification_report(predictions, y_test)
    
    print("f1_score: ", f1_score_model)
    print("accuracy_score: ", accuracy_score_model)
    print("precision_score: ", precision_score_model)
    print("recall_score: ", recall_score_model)
    print("roc_auc_score: ", roc_auc_score_model)
    print("roc_curve: ", roc_curve_model)
    print("top_k_accuracy_score: ", top_k_accuracy_score_model)
    print("-------------classification_report-------------- ")
    print(classification_report_model)
    
    # Confusion Matrix
    confusion_mtx = confusion_matrix(y_test, predictions)

    ax = plt.axes()
    sn.heatmap(confusion_mtx, annot=True,annot_kws={"size": 25}, cmap="Blues", ax = ax)
    ax.set_title('{}'.format(model_path), size=12)
    ax.xaxis.set_ticklabels(['Non-Higgs', 'Higgs']); ax.yaxis.set_ticklabels(['Non-Higgs', 'Higgs']);
    ax.set_xlabel('Reality');ax.set_ylabel('Prediction');
    plt.show()

In [None]:
reset_random_seeds()
test_path = os.path.join(DATASET_PARENT_FOLDER, "output_1.csv")
model_path = "./deepNN_model"
scalar_path = "./scaler/higgs_scaler.sav"
predictor(model_path, test_path, scalar_path)

In [None]:
ls