In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

Dataset: The data are Monte Carlo (MC) generated to simulate the registration of high-energy gamma particles in an atmospheric Cherenkov telescope.

In [None]:
# Column names for the data file: ten feature columns followed by the 'class' label.
cols = ['fLength', 'fWidth', 'fSize', 'fCon', 'fCon1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
# names=cols assigns the column names explicitly, so pandas reads the first line as data.
df = pd.read_csv('datasets/magic04.data', names=cols)
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# Inspect the distinct raw values of the label column.
df['class'].unique()

In [None]:
# Encode the label as 1 for 'g' and 0 for every other value.
# Matching against both 'g' and 1 keeps this cell idempotent: re-running it on
# the already-encoded column leaves the labels unchanged instead of turning
# every label into 0 (which `== 'g'` would do on integers).
df['class'] = df['class'].isin(['g', 1]).astype(int)

In [None]:
# Preview the first rows again to check the integer-encoded 'class' column.
df.head()

In [None]:
# List the column names; the last one ('class') is the label, the rest are features.
df.columns

In [None]:
# For every feature (all columns except the trailing 'class' label), overlay
# the normalized histograms of the two classes so they can be compared by eye.
hist_styles = ((1, 'blue', 'gamma'), (0, 'red', 'hadron'))

for label in df.columns[:-1]:
    for class_value, hist_color, hist_label in hist_styles:
        # Select this feature's values for the rows of one class.
        class_values = df.loc[df['class'] == class_value, label]
        plt.hist(class_values, color=hist_color, label=hist_label, alpha=0.7, density=True)

    plt.title(label)
    plt.xlabel(label)
    plt.ylabel('probability')
    plt.legend()
    plt.show()

Creating the train, validation, and test datasets

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split is reproducible after a
# kernel restart.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
validation_end = int(0.8 * len(df))

# Slice with .iloc instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which pandas has deprecated.
train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]

Creating a function for feature scaling and oversampling

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
    """Standardize the feature columns of ``dataframe`` and optionally oversample.

    Every column except the last is treated as a feature; the last column is
    the label.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose last column holds the labels.
    oversample : bool, default False
        When True, resample the scaled data with ``RandomOverSampler``.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler to apply to the features. Pass the scaler
        fitted on the training split here so that validation/test data are
        standardized with the training statistics. When None (the default), a
        new ``StandardScaler`` is fitted on ``dataframe`` itself.
    random_state : optional
        Forwarded to ``RandomOverSampler`` to make the resampling reproducible.

    Returns
    -------
    tuple
        ``(data, X, y)`` where ``X`` is the scaled (and possibly resampled)
        feature matrix, ``y`` the matching labels, and ``data`` the two stacked
        horizontally with the labels as the last column.
    """
    features = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    if scaler is None:
        # Default behaviour: fit a fresh scaler on this frame.
        X = StandardScaler().fit_transform(features)
    else:
        # Reuse the caller's fitted scaler; do not refit on this frame.
        X = scaler.transform(features)

    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X, y = ros.fit_resample(X, y)

    # Labels become a single column so they can be stacked next to the features.
    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y

In [None]:
# Scale all three splits; only the training split is oversampled.
# NOTE(review): each call below lets scale_dataset fit its own StandardScaler, so
# validation and test are standardized with their own statistics rather than the
# training split's — confirm this is intended.
# NOTE(review): train/validation/test are rebound from DataFrames to NumPy arrays
# here, so this cell cannot be re-run without re-running the split cell first.
train, x_train, y_train = scale_dataset(train, oversample=True)
validation, x_validation, y_validation = scale_dataset(validation, oversample=False)
test, x_test, y_test = scale_dataset(test, oversample=False)

In [None]:
# Number of training samples labelled 1 after oversampling.
np.sum(y_train == 1)

In [None]:
# Number of training samples labelled 0 after oversampling; this should match
# the class-1 count. (The previous `x_train == 0` counted exact zeros per
# feature column instead of counting labels.)
sum(y_train == 0)

# Prediction using K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# K-nearest-neighbours classifier using the 7 closest training samples.
knn_model = KNeighborsClassifier(n_neighbors=7)
# Fit on the scaled, oversampled training split.
knn_model.fit(x_train, y_train)

In [None]:
# Predict labels for the test split with the fitted KNN model.
y_predict = knn_model.predict(x_test)

In [None]:
# Show the predicted labels.
y_predict

In [None]:
# Compare the KNN predictions with the true test labels.
print(classification_report(y_test, y_predict))

# Prediction using Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes classifier with default settings.
naive_bayes_model = GaussianNB()
# Fit on the scaled, oversampled training split.
naive_bayes_model.fit(x_train, y_train)

In [None]:
# Predict labels for the test split; this rebinds y_predict from the previous model.
y_predict = naive_bayes_model.predict(x_test)

In [None]:
# Show the predicted labels.
y_predict

In [None]:
# Compare the Naive Bayes predictions with the true test labels.
print(classification_report(y_test, y_predict))

# Prediction using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with default settings.
logistic_r_model = LogisticRegression()
# Fit on the scaled, oversampled training split.
logistic_r_model.fit(x_train, y_train)

In [None]:
# Predict labels for the test split; this rebinds y_predict from the previous model.
y_predict = logistic_r_model.predict(x_test)

In [None]:
# Show the predicted labels.
y_predict

In [None]:
# Compare the logistic regression predictions with the true test labels.
print(classification_report(y_test, y_predict))

# Prediction using Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings.
svm_model = SVC()
# Fit on the scaled, oversampled training split.
svm_model.fit(x_train, y_train)

In [None]:
# Predict labels for the test split; this rebinds y_predict from the previous model.
y_predict = svm_model.predict(x_test)

In [None]:
# Show the predicted labels.
y_predict

In [None]:
# Compare the SVM predictions with the true test labels.
print(classification_report(y_test, y_predict))

# Prediction using a Neural Network with TensorFlow

In [None]:
import tensorflow as tf

In [None]:
def train_model(x_train, y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs,
                validation_data=None):
    """Build, compile and fit a two-hidden-layer binary classifier.

    The network is Dense(relu) -> Dropout -> Dense(relu) -> Dropout ->
    Dense(1, sigmoid), trained with Adam on binary cross-entropy and tracking
    accuracy.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features (2-D, samples by features) and labels.
    num_nodes : int
        Number of units in each of the two hidden layers.
    dropout_prob : float
        Dropout rate applied after each hidden layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    batch_size : int
        Mini-batch size used during fitting.
    epochs : int
        Number of training epochs.
    validation_data : tuple, optional
        ``(x_val, y_val)`` to validate on after each epoch. When None (the
        default), 20% of the training data is held out via ``validation_split``.

    Returns
    -------
    tuple
        ``(nn_model, history)``: the fitted model and the Keras History object.
    """
    # Take the input width from the data instead of hard-coding it to 10.
    n_features = np.shape(x_train)[-1]

    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    nn_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    if validation_data is None:
        # Keras takes the validation_split fraction from the end of the arrays
        # as given (before shuffling), so the row order of x_train determines
        # what is validated on. Pass validation_data to use an explicit set.
        validation_kwargs = {'validation_split': 0.2}
    else:
        validation_kwargs = {'validation_data': validation_data}

    history = nn_model.fit(
        x_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        **validation_kwargs,
    )
    return nn_model, history

In [None]:
def plot_history(history):
    """Plot training and validation loss and accuracy per epoch, side by side.

    Parameters
    ----------
    history : object
        Object whose ``history`` attribute is a mapping with the keys 'loss',
        'val_loss', 'accuracy' and 'val_accuracy' (one value per epoch), such
        as the History returned by ``train_model``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

    panels = (
        (ax1, 'loss', 'Binary crossentropy'),
        (ax2, 'accuracy', 'Accuracy'),
    )
    for ax, metric, y_label in panels:
        ax.plot(history.history[metric], label=metric)
        ax.plot(history.history[f'val_{metric}'], label=f'val_{metric}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.grid(True)
        # Without a legend the curve labels are never drawn and the training
        # and validation curves cannot be told apart.
        ax.legend()

    plt.show()
    # This function is called repeatedly in a loop; release the figure so
    # open figures do not accumulate.
    plt.close(fig)

In [None]:
# Exhaustive grid search: train one network per hyperparameter combination and
# keep the one with the lowest loss on the validation split.
epochs = 100
least_validation_loss = float('inf')
least_loss_model = None
# Hyperparameters of the best model so far, so the winner can be reported and
# retrained later.
least_loss_params = None

hyperparameter_grid = [
    (nodes, dropout, rate, size)
    for nodes in [16, 32, 64]
    for dropout in [0, 0.2]
    for rate in [0.01, 0.005, 0.001]
    for size in [32, 64, 128]
]

for num_nodes, dropout_prob, learning_rate, batch_size in hyperparameter_grid:
    print(f'nodes {num_nodes}, dropout probability {dropout_prob}, learning rate {learning_rate}, batch size {batch_size}')
    model, history = train_model(x_train, y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs)
    plot_history(history)

    # evaluate() returns [loss, accuracy] for this model; index 0 is the loss.
    validation_loss = model.evaluate(x_validation, y_validation)[0]
    if validation_loss < least_validation_loss:
        least_validation_loss = validation_loss
        least_loss_model = model
        least_loss_params = {
            'num_nodes': num_nodes,
            'dropout_prob': dropout_prob,
            'learning_rate': learning_rate,
            'batch_size': batch_size,
        }

print(f'best validation loss {least_validation_loss:.4f} with {least_loss_params}')

In [None]:
# Predict on the test split with the model that had the lowest validation loss.
# The output layer is a sigmoid, so these are values in (0, 1), not labels yet.
y_predict = least_loss_model.predict(x_test)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels at a 0.5 threshold and flatten
# the result to one dimension.
is_positive = y_predict > 0.5
y_predict = is_positive.astype(int).ravel()

In [None]:
# Show the thresholded 0/1 predictions.
y_predict

In [None]:
# Compare the neural network predictions with the true test labels.
print(classification_report(y_test, y_predict))