Now we will prepare the data for testing and training

In [170]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier

def compute_fom(S, B):
    return S / np.sqrt(S + B)

def normalize_data(col, dataset):
    col_data = dataset[col]
    new_col = f"{col}_normal"
    normal = (col_data - col_data.mean()) / col_data.std()
    dataset.loc[:, new_col] = normal

# Marks test_perc percentage of data as testing data
def create_training_data(test_perc, max_cutoff = 0):
    data_gam = pd.read_csv("emc_gam.txt")
    data_neutron = pd.read_csv("emc_neutron.txt")

    print(f"Marking {(1 - test_perc) * 100:.0f}% of data as training data")

    data_neutron["IsPhoton"] = pd.Series(0, index = data_neutron.index, dtype = "int32")
    if max_cutoff != 0:
        print(f"Cutting off data at {max_cutoff} entries")
        data_neutron = data_neutron[:(max_cutoff // 2)]

    data_gam["IsPhoton"] = pd.Series(1, index = data_gam.index, dtype = "int32")
    if max_cutoff != 0:
        data_gam = data_gam[:(max_cutoff // 2)]

    merged = pd.concat([data_neutron, data_gam])
    merged["IsTrain"] = np.random.uniform(0, 1, len(merged)) <= (1 - test_perc)

    merged_train, merged_test = merged[(merged["IsTrain"] == True)].copy(), merged[(merged["IsTrain"] == False)].copy()

    features = ["Th", "Ph", "E", "NrHits", "NrBumps", "E1", "E1E9", "E9E25", "Z20", "Z53", "LatMom"]
    for feat in features:
        normalize_data(feat, merged_train)
        normalize_data(feat, merged_test)

    return merged_train, merged_test

train, test = create_training_data(0.3, 100)


Marking 70% of data as training data
Cutting off data at 100 entries
1     0.009551
2    -1.541031
3     0.031754
4     0.388156
5     0.141089
        ...   
43    0.619483
44   -1.203296
45    0.480820
47    1.057505
48    0.084561
Name: Th_normal, Length: 71, dtype: float64
0    -1.072705
8    -0.769298
10   -0.442869
17    1.496255
21    0.631387
24    1.589395
25   -0.045835
31    1.057718
33   -0.660545
38    0.355575
44    0.851349
47    1.027786
49    0.269674
0    -1.050317
14   -1.054509
15   -0.945981
18    1.584569
20   -0.378293
21   -1.760937
26    0.374423
27    0.493075
28   -1.232985
30    0.739624
35   -1.406837
38   -0.667265
40    1.330448
42   -1.169722
46    0.668103
49    0.188717
Name: Th_normal, dtype: float64
1     0.495010
2    -0.171227
3     0.017295
4    -0.876476
5     0.368529
        ...   
43   -0.795912
44    0.490621
45   -0.594540
47   -0.837049
48   -1.672850
Name: Ph_normal, Length: 71, dtype: float64
0    -0.199648
8     1.292900
10   -1.962196
1

In [167]:
normal_features = ["Th_normal", "Ph_normal", "E_normal", "NrHits_normal", "NrBumps_normal", "E1_normal", "E1E9_normal", "E9E25_normal", "Z20_normal", "Z53_normal", "LatMom_normal"]

def count_successes(pred):
    train_size = len(pred[(pred.IsTrain == True)])
    test_size = len(pred[(pred.IsTrain == False)])

    train_successes = len(pred[(pred["IsTrain"] == True) & (pred["Prediction"] == pred["IsPhoton"])])
    test_successes = len(pred[(pred["IsTrain"] == False) & (pred["Prediction"] == pred["IsPhoton"])])

    return (train_successes / train_size), (test_successes / test_size)

def decision_surface_plot(clf, predictions, features):


    return

def knn_classify(k, train_data, test_data, features):
    # print(train_data["IsPhoton"])

    clf = KNeighborsClassifier(k)
    clf.fit(train_data[features], train_data["IsPhoton"])

    print("Train Count: ", len(train_data[(train_data["IsPhoton"] == 1)]))

    # Evaluate the training points
    train_pred = clf.predict(train_data[features])
    train_data.loc[:, "Prediction"] = [x for x in train_pred]

    test_pred = clf.predict(test_data[features])
    test_data.loc[:, "Prediction"] = [x for x in test_pred]

    pred_data = pd.concat([train_data, test_data])

    train_succ, test_succ = count_successes(pred_data)

    # print(train_succ, test_succ)

def classify(method, k, test_data, train_data, features):
    if method == "knn":
        return knn_classify(k, test_data, train_data, features)
    else:
        return

print(test["NrBumps_normal"])

classify("knn", 15, test, train, normal_features)

0    NaN
4    NaN
7    NaN
9    NaN
11   NaN
16   NaN
20   NaN
27   NaN
28   NaN
30   NaN
38   NaN
44   NaN
48   NaN
0    NaN
6    NaN
8    NaN
12   NaN
14   NaN
17   NaN
19   NaN
20   NaN
22   NaN
26   NaN
27   NaN
28   NaN
29   NaN
35   NaN
37   NaN
38   NaN
40   NaN
44   NaN
Name: NrBumps_normal, dtype: float64


ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values