# Creating Dataset

This notebook creates a training dataset for the network, with input <code>[eta,deta,dphi,theta,qop,pt,phi,z0,dz0,d0,bhadpt,q,n]</code> and the efficiency as the label.<br>
The code builds one tensor that holds the input vectors, e.g. <code>[input1,input2,...]</code>, and a second tensor that holds the corresponding labels, e.g. <code>[eff1,eff2,....]</code>.<br>
Some of the entries in these tensors are then set aside as the test dataset, and the rest form the training dataset.

## Import the required libraries

In [1]:
import numpy as np
import torch
import os

In [2]:
# load data
# ref main.ipynb

# np.load on an .npz file returns an archive object that keeps the file open
# and reads each array only when it is accessed, so materialise every array
# inside the `with` block and let the context manager close the handle.
with np.load("testevals/data_10k.npz") as npz_archive:
    # Convert the data from numpy arrays to pytorch tensors
    data = {k: torch.from_numpy(v) for k, v in npz_archive.items()}

In [3]:
# ref plot.py
from metrics import calc_match_scores

def calc_scores(reco_method,match_thresh = 0.75):
    """Score one reconstruction method against the ``sudo`` reference entries.

    Reads ``{reco_method}_pix_valid``, ``{reco_method}_sct_valid`` and
    ``{reco_method}_valid`` plus the matching ``sudo_*`` entries from the
    notebook-level ``data`` dict, scores them with ``calc_match_scores``
    (metric ``"tmp"``) and treats every score ``>= match_thresh`` as a match.

    Side effect: prints the efficiency tensor, then the purity tensor.

    Args:
        reco_method: key prefix of the candidate entries in ``data``.
        match_thresh: minimum score for a pair to count as matched.

    Returns:
        ``(pred_has_match, roi_eff, roi_pur)``: the boolean mask of valid
        candidate slots that have a match, the efficiency
        (matched reference count / valid reference count) and the purity
        (matched candidate count / valid candidate count).
    """
    mask_names = ("pix_valid", "sct_valid", "valid")
    cand_pix, cand_sct, cand_valid = (data[f"{reco_method}_{name}"] for name in mask_names)
    ref_pix, ref_sct, ref_valid = (data[f"sudo_{name}"] for name in mask_names)

    scores = calc_match_scores(cand_pix, cand_sct, cand_valid, ref_pix, ref_sct, ref_valid, "tmp")
    is_match = scores >= match_thresh

    # A slot only counts when it is flagged valid and has at least one match.
    # presumably axis 1 runs over candidates and axis 2 over references -- TODO confirm
    ref_matched = is_match.any(1) & ref_valid
    cand_matched = is_match.any(2) & cand_valid

    efficiency = ref_matched.sum(-1) / ref_valid.sum(-1)    # matched num / true num
    purity = cand_matched.sum(-1) / cand_valid.sum(-1)      # matched num / predicted num

    print(efficiency)
    print(purity)
    return cand_matched, efficiency, purity

# Score each reconstruction method once, in a fixed order, so the tensors
# printed by calc_scores appear as pred, sisp, reco, sudo.
_scores_by_method = {
    method: calc_scores(method) for method in ("pred", "sisp", "reco", "sudo")
}

pred_pred_has_match, pred_roi_eff, pred_roi_pur = _scores_by_method["pred"]
sisp_pred_has_match, sisp_roi_eff, sisp_roi_pur = _scores_by_method["sisp"]
reco_pred_has_match, reco_roi_eff, reco_roi_pur = _scores_by_method["reco"]
sudo_pred_has_match, sudo_roi_eff, sudo_roi_pur = _scores_by_method["sudo"]

# Per-method lookup tables keyed by the method name.
pred_has_match = dict(
    pred=pred_pred_has_match,
    sisp=sisp_pred_has_match,
    reco=reco_pred_has_match,
    sudo=sudo_pred_has_match,
)
roi_pur = dict(pred=pred_roi_pur, sisp=sisp_roi_pur, reco=reco_roi_pur, sudo=sudo_roi_pur)
roi_eff = dict(pred=pred_roi_eff, sisp=sisp_roi_eff, reco=reco_roi_eff, sudo=sudo_roi_eff)

tensor([1.0000, 0.8571, 0.8000,  ..., 1.0000, 1.0000, 1.0000])
tensor([0.6250, 0.8571, 0.8889,  ..., 1.0000, 1.0000, 1.0000])
tensor([1.0000, 1.0000, 0.9000,  ..., 1.0000, 1.0000, 1.0000])
tensor([0.5556, 0.4375, 0.5556,  ..., 0.7500, 1.0000, 1.0000])
tensor([1.0000, 0.8571, 0.8000,  ..., 1.0000, 1.0000, 1.0000])
tensor([0.7143, 0.7500, 0.7273,  ..., 1.0000, 1.0000, 1.0000])
tensor([1., 1., 1.,  ..., 1., 1., 1.])
tensor([1., 1., 1.,  ..., 1., 1., 1.])


In [4]:
def calculate_quanity(qtys = "deta"):
    """Split the ``sudo_<qtys>`` values into matched and unmatched groups.

    Values are read from ``data[f"sudo_{qtys}"]`` and selected with two masks
    built from the notebook-level ``pred_has_match["pred"]`` and
    ``data["pred_valid"]``: "matched" slots are valid and have a match,
    "unmatched" slots are valid and have no match.

    Args:
        qtys: name of the quantity; the key looked up is ``"sudo_" + qtys``.

    Returns:
        ``(matched_reco, unmatched_reco)``: the selected values, concatenated
        row after row in slot order.
    """
    has_match = pred_has_match["pred"]
    valid = data["pred_valid"]
    values = data[f"sudo_{qtys}"]
    # NOTE(review): the masks describe the "pred" slots but index the "sudo"
    # values; this is only meaningful if both share the same slot ordering --
    # confirm, or read the quantity from the "pred" entries instead.

    # Indexing with a 2-D boolean mask returns the selected elements in
    # row-major order, which is the order a row-by-row concatenation gives,
    # without re-copying the growing result for every row.
    # Concatenating onto an empty default-dtype tensor keeps torch.cat's dtype
    # promotion (e.g. integer inputs come back as floats).
    seed = torch.tensor([])
    matched_reco = torch.cat((seed, values[has_match & valid]), 0)
    unmatched_reco = torch.cat((seed, values[~has_match & valid]), 0)

    return matched_reco, unmatched_reco

In [5]:
# Test the function: show each returned tensor followed by its shape
matched_data, unmatched_data = calculate_quanity()
for _subset in (matched_data, unmatched_data):
    print(_subset)
    print(_subset.shape)

tensor([-0.0037, -0.0037,  0.0046,  ..., -0.0452, -0.0272,  0.0388])
torch.Size([60438])
tensor([0.0000, 0.0000, 0.0000,  ..., 0.0002, 0.0000, 0.0000])
torch.Size([9874])


In [6]:
# Calculate number of hits of reconstructed tracks
def _count_valid_hits(hit_mask):
    """Count the non-zero entries of every (row, slot) pair in ``hit_mask``."""
    if hit_mask.dim() > 2:
        # Collapse everything after the slot axis so one reduction covers it.
        return hit_mask.flatten(start_dim=2).count_nonzero(dim=-1)
    # A 2-D mask carries a single entry per slot.
    return hit_mask.unsqueeze(-1).count_nonzero(dim=-1)


def calculate_hits():
    """Return the hit counts of matched and unmatched "pred" slots.

    A slot's hit count is the number of non-zero entries in its
    ``data["pred_pix_valid"]`` row plus the number in its
    ``data["pred_sct_valid"]`` row. Only slots flagged in
    ``data["pred_valid"]`` are reported; they are split by the notebook-level
    ``pred_has_match["pred"]`` mask.

    Returns:
        ``(hit_matched, hit_unmatched)``: 1-D tensors of hit counts, ordered
        row by row and slot by slot.
    """
    hits = _count_valid_hits(data["pred_pix_valid"]) + _count_valid_hits(data["pred_sct_valid"])
    valid = data["pred_valid"]
    has_match = pred_has_match["pred"]

    # Boolean-mask indexing walks the slots in row-major order, so the result
    # order is the same as visiting every row and slot in nested loops.
    # Concatenating onto an empty default-dtype tensor makes the integer
    # counts come back as floats via torch.cat's dtype promotion.
    seed = torch.tensor([])
    hit_matched = torch.cat((seed, hits[valid & has_match]), dim=0)
    hit_unmatched = torch.cat((seed, hits[valid & ~has_match]), dim=0)
    return hit_matched, hit_unmatched

In [7]:
# Sanity check: display the (matched, unmatched) hit-count tensors as the cell output.
calculate_hits()

(tensor([13., 12., 12.,  ..., 13., 12., 11.]),
 tensor([12., 11., 12.,  ...,  7.,  2., 10.]))

It seems we have 60438 matched tracks and 9874 unmatched tracks.<br>
I choose the first 3k matched tracks and the first 500 unmatched tracks as my test dataset.

In [8]:
# Hold-out sizes: the leading entries of each class become the test set.
num_test_good = 3_000  # first 3k of the good reco tracks
num_test_bad = 500     # first 500 of the bad reco tracks

In [9]:
quantity = ["pt", "eta", "deta", "phi", "dphi", "theta", "z0", "dz0", "d0", "q", "qop", "bhadpt"]

# One-hot label rows: [1, 0] marks a good (matched) track, [0, 1] a bad one.
_good_row = torch.tensor([1, 0])
_bad_row = torch.tensor([0, 1])

# Whatever is not held out for testing goes to the training set.
_num_train_good = matched_data.shape[0] - num_test_good
_num_train_bad = unmatched_data.shape[0] - num_test_bad

# Test labels: all good rows first, then all bad rows.
data_test_label = torch.cat(
    [_good_row.repeat(num_test_good, 1), _bad_row.repeat(num_test_bad, 1)],
    dim=0,
)

# Train labels follow the same good-then-bad layout.
data_train_label_good_reco = _good_row.repeat(_num_train_good, 1)
data_train_label_bad_reco = _bad_row.repeat(_num_train_bad, 1)
data_train_label = torch.cat(
    (data_train_label_good_reco, data_train_label_bad_reco),
    dim=0,
)



In [10]:
# Hit counts come from their own helper; every other quantity is split into
# (matched, unmatched) tensors by calculate_quanity, evaluated in this order.
hit_matched, hit_unmatched = calculate_hits()

_split_by_name = {
    name: calculate_quanity(name)
    for name in ("pt", "eta", "deta", "phi", "dphi", "theta",
                 "z0", "dz0", "d0", "q", "qop", "bhadpt")
}

pt_matched, pt_unmatched = _split_by_name["pt"]
eta_matched, eta_unmatched = _split_by_name["eta"]
deta_matched, deta_unmatched = _split_by_name["deta"]
phi_matched, phi_unmatched = _split_by_name["phi"]
dphi_matched, dphi_unmatched = _split_by_name["dphi"]
theta_matched, theta_unmatched = _split_by_name["theta"]
z0_matched, z0_unmatched = _split_by_name["z0"]
dz0_matched, dz0_unmatched = _split_by_name["dz0"]
d0_matched, d0_unmatched = _split_by_name["d0"]
q_matched, q_unmatched = _split_by_name["q"]
qop_matched, qop_unmatched = _split_by_name["qop"]
bhadpt_matched, bhadpt_unmatched = _split_by_name["bhadpt"]

In [11]:
def _test_feature_rows(hit, pt, eta, deta, dphi, theta, qop, n_rows):
    """Build the first ``n_rows`` feature vectors as an ``(n_rows, 7)`` tensor.

    Column order: hit, pt, eta, deta, dphi, theta, qop, each rescaled with a
    fixed constant (presumably to bring the features to a comparable range --
    TODO confirm the intended ranges).

    Raises:
        IndexError: if fewer than ``n_rows`` entries are available.
    """
    if n_rows > hit.shape[0]:
        raise IndexError(f"requested {n_rows} rows but only {hit.shape[0]} are available")
    rows = slice(0, n_rows)
    columns = [
        hit[rows] / 18,
        pt[rows] / 2.e5,
        0.5 + eta[rows] / 5,
        0.5 + deta[rows] / 0.1,
        0.5 + dphi[rows] / 0.1,
        theta[rows] / np.pi,
        0.5 + qop[rows] / 1,
    ]
    # Cast every column to the default dtype so columns with different
    # source dtypes can be stacked into one tensor.
    dtype = torch.get_default_dtype()
    return torch.stack([column.to(dtype) for column in columns], dim=1)


# Test inputs: the leading good tracks followed by the leading bad tracks.
# Whole columns are sliced and stacked at once instead of appending one row
# per iteration, which re-copies the growing tensor every time.
# phi, z0, dz0, d0, q and bhadpt are not part of the feature vector.
data_test_input = torch.cat((
    _test_feature_rows(hit_matched, pt_matched, eta_matched, deta_matched,
                       dphi_matched, theta_matched, qop_matched, num_test_good),
    _test_feature_rows(hit_unmatched, pt_unmatched, eta_unmatched, deta_unmatched,
                       dphi_unmatched, theta_unmatched, qop_unmatched, num_test_bad),
), dim=0)
print(data_test_input.shape)

torch.Size([3500, 7])


In [12]:
def _train_feature_rows(hit, pt, eta, deta, dphi, theta, qop, start, stop):
    """Build the feature vectors for rows ``start`` (inclusive) to ``stop`` (exclusive).

    Returns a ``(stop - start, 7)`` tensor with the columns hit, pt, eta,
    deta, dphi, theta, qop, each rescaled with a fixed constant (presumably to
    bring the features to a comparable range -- TODO confirm the intended
    ranges).

    Raises:
        IndexError: if ``stop`` is past the end of the available entries.
    """
    if stop > hit.shape[0]:
        raise IndexError(f"requested rows up to {stop} but only {hit.shape[0]} are available")
    rows = slice(start, stop)
    columns = [
        hit[rows] / 18,
        pt[rows] / 2.e5,
        0.5 + eta[rows] / 5,
        0.5 + deta[rows] / 0.1,
        0.5 + dphi[rows] / 0.1,
        theta[rows] / np.pi,
        0.5 + qop[rows] / 1,
    ]
    # Cast every column to the default dtype so columns with different
    # source dtypes can be stacked into one tensor.
    dtype = torch.get_default_dtype()
    return torch.stack([column.to(dtype) for column in columns], dim=1)


# Train inputs: every good track after the held-out ones, then every bad
# track after the held-out ones. Whole columns are sliced and stacked at once
# instead of appending one row per iteration, which re-copies the growing
# tensor every time.
# phi, z0, dz0, d0, q and bhadpt are not part of the feature vector.
data_train_input = torch.cat((
    _train_feature_rows(hit_matched, pt_matched, eta_matched, deta_matched,
                        dphi_matched, theta_matched, qop_matched,
                        num_test_good, matched_data.shape[0]),
    _train_feature_rows(hit_unmatched, pt_unmatched, eta_unmatched, deta_unmatched,
                        dphi_unmatched, theta_unmatched, qop_unmatched,
                        num_test_bad, unmatched_data.shape[0]),
), dim=0)
print(data_train_input.shape)

torch.Size([66812, 7])


In [13]:
# Report the final tensor shapes before writing anything to disk.
print(f"Train dataset input(good) size = {data_train_input.shape}")
print(f"Train dataset label(good) size = {data_train_label.shape}")
print(f"Test dataset input size = {data_test_input.shape}")
print(f"Test dataset label size = {data_test_label.shape}")

folder = "75_dataset"
out_dir = f"data/{folder}"
try:
    os.makedirs(out_dir)
except FileExistsError:
    # Only an existing path is expected here; any other failure (permissions,
    # read-only file system, ...) is a real error and is allowed to propagate
    # instead of being reported as "already created".
    print("Path already created")

# NOTE(review): torch.save writes a serialized tensor, not CSV text, so the
# ".csv" extension is misleading. The names are kept unchanged because code
# that reads these files back would look for exactly these paths.
torch.save(data_train_input, f"{out_dir}/data_train_input.csv")
torch.save(data_train_label, f"{out_dir}/data_train_label.csv")
torch.save(data_test_input, f"{out_dir}/data_test_input.csv")
torch.save(data_test_label, f"{out_dir}/data_test_label.csv")

Train dataset input(good) size = torch.Size([66812, 7])
Train dataset label(good) size = torch.Size([66812, 2])
Test dataset input size = torch.Size([3500, 7])
Test dataset label size = torch.Size([3500, 2])
