In [1]:
import pandas as pd
import numpy as np
# Load the whitespace-separated feature table.
# The first line of the file is skipped (skiprows=1) and no line is treated
# as a header (header=None), so the columns get integer labels.
# sep=r"\s+" replaces delim_whitespace=True, which is deprecated in recent
# pandas releases; both split fields on runs of whitespace.
filename="./data/MiniBooNE_PID.txt"
data = pd.read_csv(filename, sep=r"\s+", skiprows=1, header=None)
print(data.head())

        0         1         2         3         4         5         6   \
0  2.59413  0.468803   20.6916  0.322648  0.009682  0.374393  0.803479   
1  3.86388  0.645781   18.1375  0.233529  0.030733  0.361239  1.069740   
2  3.38584  1.197140   36.0807  0.200866  0.017341  0.260841  1.108950   
3  4.28524  0.510155  674.2010  0.281923  0.009174  0.000000  0.998822   
4  5.93662  0.832993   59.8796  0.232853  0.025066  0.233556  1.370040   

         7        8         9   ...       40       41        42       43  \
0  0.896592  3.59665  0.249282  ...  101.174 -31.3730  0.442259  5.86453   
1  0.878714  3.59243  0.200793  ...  186.516  45.9597 -0.478507  6.11126   
2  0.884405  3.43159  0.177167  ...  129.931 -11.5608 -0.297008  8.27204   
3  0.823390  3.16382  0.171678  ...  163.978 -18.4586  0.453886  2.48112   
4  0.787424  3.66546  0.174862  ...  229.555  42.9600 -0.975752  2.66109   

         44        45        46        47        48        49  
0  0.000000  0.090519  0.176909  0

In [2]:
# Replace the integer column labels with 1-based names: PID_1 ... PID_<n_columns>.
# The labels are printed before and after so the change is visible.
print(data.keys())
data.columns = [f"PID_{col + 1}" for col in range(data.shape[1])]  # type: ignore
print(data.keys())

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
           dtype='int64')
Index(['PID_1', 'PID_2', 'PID_3', 'PID_4', 'PID_5', 'PID_6', 'PID_7', 'PID_8',
       'PID_9', 'PID_10', 'PID_11', 'PID_12', 'PID_13', 'PID_14', 'PID_15',
       'PID_16', 'PID_17', 'PID_18', 'PID_19', 'PID_20', 'PID_21', 'PID_22',
       'PID_23', 'PID_24', 'PID_25', 'PID_26', 'PID_27', 'PID_28', 'PID_29',
       'PID_30', 'PID_31', 'PID_32', 'PID_33', 'PID_34', 'PID_35', 'PID_36',
       'PID_37', 'PID_38', 'PID_39', 'PID_40', 'PID_41', 'PID_42', 'PID_43',
       'PID_44', 'PID_45', 'PID_46', 'PID_47', 'PID_48', 'PID_49', 'PID_50'],
      dtype='object')


In [3]:
# Dimensions of the loaded table as (number of rows, number of columns)
data.shape

(130064, 50)

In [4]:
# Feature matrix: X refers to the same DataFrame object as `data` (no copy is made)
X=data

In [5]:
""" Create target vector for each row of the dataset. 
The first value in the first row contains the number of signal events, the second the number of background events. 
The target vector should contain 1 for signal events and 0 for background events."""

# Get number of signal and background events from the first line of the file.
# The context manager closes the file handle as soon as the line has been read.
with open(filename) as data_file:
    signal, background = data_file.readline().split()

# Labels are laid out as `signal` ones followed by `background` zeros
# (assumes the rows in the file are ordered signal first -- TODO confirm).
y = np.concatenate((np.ones(int(signal)), np.zeros(int(background))))

# Every feature row needs exactly one label
assert len(y) == len(data), "number of labels does not match number of data rows"
print(y)

[1. 1. 1. ... 0. 0. 0.]


Standardization rescales each input feature so that it has a mean of 0 and a standard deviation of 1.

In [6]:
from sklearn import preprocessing

# Standardize the input data.
# The scaler is fitted on every row of X and then applied to the same rows.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit(X).transform(X)



In [7]:
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

# Split the raw features into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the splits so the classifiers below are trained on scaled
# features (previously the unscaled X was passed on and the standardized
# data was never used). The scaler is fitted on the training rows only and
# then applied to both splits, which keeps test-set statistics out of the
# preprocessing step.
split_scaler = preprocessing.StandardScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)

In [8]:
# Base estimator for the hyperparameter search.
# It is deliberately not fitted here: GridSearchCV works on clones of the
# estimator, so a fit at this point would only cost time and be discarded.
nn_clf = MLPClassifier(random_state=42)

# Define the hyperparameter space to search over for the vanilla neural network classifier
nn_param_grid = {
    'hidden_layer_sizes': [(100,), (100, 50), (100, 50, 25)],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [200, 300, 400]
}

# Set up the grid search using 5-fold cross-validation
nn_grid_search = GridSearchCV(nn_clf, nn_param_grid, cv=5, scoring='accuracy')

# Perform the grid search on the training set
nn_grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the best cross-validation accuracy
print('Vanilla Neural Network best hyperparameters:', nn_grid_search.best_params_)
print('Vanilla Neural Network best cross-validation accuracy:', nn_grid_search.best_score_)

# Take the estimator configured with the best hyperparameters
nn_clf = nn_grid_search.best_estimator_

# Fit it on the full training set once more, purely to measure training time.
# perf_counter() is a monotonic clock intended for measuring intervals.
nn_start_time = time.perf_counter()
nn_clf.fit(X_train, y_train)
nn_end_time = time.perf_counter()

# Evaluate the classifier on the test set
nn_test_accuracy = nn_clf.score(X_test, y_test)
print('Time required to train Vanilla Neural Network classifier:', nn_end_time - nn_start_time)
print('Vanilla Neural Network test accuracy:', nn_test_accuracy)



In [None]:
# Base SVM estimator for the hyperparameter search.
# It is deliberately not fitted here: GridSearchCV works on clones of the
# estimator, so a fit at this point would only cost time and be discarded.
svm_clf = SVC(random_state=42)

# Hyperparameter space: kernel type and regularization strength C
svm_param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10]
}

# Grid search with 5-fold cross-validation, scored by accuracy
svm_grid_search = GridSearchCV(svm_clf, svm_param_grid, cv=5, scoring='accuracy')

svm_grid_search.fit(X_train, y_train)

print('SVM best hyperparameters:', svm_grid_search.best_params_)
print('SVM best cross-validation accuracy:', svm_grid_search.best_score_)

# Take the estimator configured with the best hyperparameters
svm_clf = svm_grid_search.best_estimator_

# Fit it on the full training set once more, purely to measure training time.
# perf_counter() is a monotonic clock intended for measuring intervals.
svm_start_time = time.perf_counter()
svm_clf.fit(X_train, y_train)
svm_end_time = time.perf_counter()

# Evaluate the classifier on the test set
svm_test_accuracy = svm_clf.score(X_test, y_test)
print('Time required to train SVM classifier:', svm_end_time - svm_start_time)
print('SVM test accuracy:', svm_test_accuracy)

In [None]:
# Base random forest estimator for the hyperparameter search
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Hyperparameter space for the random forest.
# 4 * 4 * 3 * 3 * 3 = 432 candidates, each evaluated with 5-fold CV below.
param_grid = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None]
}

# Search over the random-forest grid defined above. Passing the SVM grid here
# would fail, because 'kernel' and 'C' are not RandomForestClassifier parameters.
rf_grid_search = GridSearchCV(rf_clf, param_grid, cv=5, scoring='accuracy')

rf_grid_search.fit(X_train, y_train)

print('RF best hyperparameters:', rf_grid_search.best_params_)
print('RF best cross-validation accuracy:', rf_grid_search.best_score_)

# Take the estimator configured with the best hyperparameters and fit it on
# the full training set once more, purely to measure training time
rf_clf = rf_grid_search.best_estimator_
rf_start_time = time.time()
rf_clf.fit(X_train, y_train)
rf_end_time = time.time()

# Evaluate the classifier on the test set
rf_test_accuracy = rf_clf.score(X_test, y_test)
print('Time required to train RF classifier:', rf_end_time - rf_start_time)
print('RF test accuracy:', rf_test_accuracy)

In [None]:
# Predict test-set labels with each tuned classifier
# (neural network, SVM, random forest -- evaluated in that order)
nn_predictions, svm_predictions, rf_predictions = (
    classifier.predict(X_test) for classifier in (nn_clf, svm_clf, rf_clf)
)

In [None]:
from sklearn.metrics import confusion_matrix

# Best hyperparameters found by each grid search, keyed by the model's display name
model_hyperparams = {
    'Vanilla Neural Network': nn_grid_search.best_params_,
    'SVM': svm_grid_search.best_params_,
    'Random Forest': rf_grid_search.best_params_
}

# Confusion matrix of each model on the test set (true labels vs. predictions)
nn_confusion_matrix, svm_confusion_matrix, rf_confusion_matrix = (
    metrics.confusion_matrix(y_test, predicted)
    for predicted in (nn_predictions, svm_predictions, rf_predictions)
)

# Report, per model: its confusion matrix followed by its hyperparameters.
# The dictionary preserves insertion order, so it pairs up with the matrices.
model_matrices = zip(
    model_hyperparams,
    (nn_confusion_matrix, svm_confusion_matrix, rf_confusion_matrix),
)
for model_name, model_matrix in model_matrices:
    print(f'{model_name} confusion matrix:')
    print(model_matrix)
    print(f'{model_name} hyperparameters:', model_hyperparams[model_name])

In [None]:
# Test-set accuracy of each classifier, computed from its stored predictions
nn_accuracy, svm_accuracy, rf_accuracy = (
    metrics.accuracy_score(y_test, predicted)
    for predicted in (nn_predictions, svm_predictions, rf_predictions)
)

print("Vanilla Neural Network Accuracy:", nn_accuracy)
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)