In [1]:
import os

def list_files_in_directory(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories are skipped and the search is not recursive. Names are
    returned sorted, because ``os.listdir`` yields entries in arbitrary order
    and callers that index into the result would otherwise pick a different
    file from one machine or run to the next.

    Args:
        directory: Path of the directory to inspect.

    Returns:
        A sorted list of file names (not full paths). If the directory does
        not exist or cannot be read, a human-readable error message is
        returned as a ``str`` instead of a list, so callers should check the
        type of the result before iterating over it.
    """
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        return f"The directory {directory} does not exist."
    except PermissionError:
        return f"Permission denied to access the directory {directory}."
    return sorted(
        name for name in entries
        if os.path.isfile(os.path.join(directory, name))
    )

In [None]:
files = list_files_in_directory('../Dataset')
# list_files_in_directory reports failures by returning a message string.
# Iterating over that string would silently give an empty list, so stop here
# with the message instead.
if isinstance(files, str):
    raise RuntimeError(files)
# Keep only names that END in '.root'; a substring test would also accept
# names such as 'run.root.bak'.
roots = [f for f in files if f.endswith('.root')]
roots

In [None]:
import ROOT
import numpy as np


# Make ../Dataset the working directory: `roots` holds bare file names and the
# run loops open '<run>.root' by name, so those opens resolve relative to it.
os.chdir("../Dataset")

In [5]:
# Open the first .root file found; it is used below to try out the helpers.
myFile = ROOT.TFile.Open(roots[0])
# Common prefix of the histogram names; an integer index is appended to it to
# build the name of each histogram.
histnames = 'h_SINT_1_0_'

def extract_single_histogram(root_file, hist, label):
    """Read one histogram and return its bin contents followed by a label.

    Args:
        root_file: Container indexable by histogram name (here an open ROOT
            file).
        hist: Name of the histogram to read.
        label: Class label stored as the final element of the result.

    Returns:
        1-D numpy array holding the contents of bins 1..GetNbinsX(), in
        order, with ``label`` appended as the last element.
    """
    histogram = root_file[hist]
    n_bins = histogram.GetNbinsX()

    # Bin indices run from 1 to n_bins inclusive.
    contents = []
    for bin_index in range(1, n_bins + 1):
        contents.append(histogram.GetBinContent(bin_index))

    return np.concatenate((np.ravel(contents), np.ravel([label])))

# Quick check on a single histogram (index 1) of the sample file, labelled 0.
extract_single_histogram(myFile, histnames+'1', 0)

array([361., 360., 358., 364., 371., 375., 379., 375., 368., 369., 364.,
       361., 355., 359., 359., 357., 359., 362., 355., 353., 358., 359.,
       364., 369., 362., 362., 364., 369., 394., 440., 488., 531., 572.,
       609., 646., 675., 697., 719., 739., 744., 750., 750., 755., 761.,
       765., 768., 772., 779., 779., 787., 788., 789., 788., 790., 795.,
       796., 800., 801., 800., 805., 808., 811., 820., 826., 818., 822.,
       823., 822., 814., 812., 817., 812., 808., 810., 805., 804., 806.,
       809., 815., 816., 817., 817., 815., 815., 813., 818., 818., 819.,
       822., 819., 816., 814., 814., 817., 815., 816., 814., 816., 821.,
       824.,   0.])

In [6]:
def all_hist_file(root_file, label):
    """Extract every histogram of a file into one 2-D array.

    Histogram names are built as ``histnames + str(i)`` for ``i`` in
    ``0 .. root_file.GetNkeys() - 1``, where ``histnames`` is the
    notebook-level name prefix.

    Args:
        root_file: Open ROOT file to read from.
        label: Class label appended as the last column of every row.

    Returns:
        Float array with one row per histogram (bin contents followed by the
        label). An array of shape ``(0, 101)`` is returned when the file
        reports no keys.

    Raises:
        ValueError: If the histograms do not all have the same number of bins.
    """
    rows = [
        extract_single_histogram(root_file, histnames + str(i), label)
        for i in range(root_file.GetNkeys())
    ]

    if not rows:
        # Same empty result the previous reshape(-1, 101) produced.
        return np.empty((0, 101))

    # Stack the rows as they are instead of forcing reshape(-1, 101): with a
    # different bin count the reshape could silently cut rows at the wrong
    # positions, whereas stacking keeps one histogram per row.
    return np.vstack(rows)

In [7]:
# Try the whole-file extraction on the sample file, labelling every row 0.
all_hist_file(myFile, label=0)

array([[380., 382., 381., ..., 837., 838.,   0.],
       [361., 360., 358., ..., 821., 824.,   0.],
       [381., 389., 391., ..., 833., 837.,   0.],
       ...,
       [375., 376., 383., ..., 830., 828.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]], shape=(11102, 101))

In [None]:
# Run numbers for tritium (A=3, Z=1, Q=1) and alpha (A=4, Z=2, Q=2), with good = 0/1.
# NOTE(review): what "good = 0/1" refers to is not visible in this notebook — confirm.

tritium_run = [167765, 167766, 167767, 167768, 167865, 167866, 167867, 167868, 167869, 167870, 167871, 167872]
alpha_run = [167853, 167857, 167858, 167859, 167863, 167864, 167917, 167918, 167919]

In [None]:
# Row 0 is an all-zero placeholder. It is kept because the cell that follows
# the two loading loops strips the first row of each array with [1:].
tritium_chunks = [np.zeros((1, 101))]
tritium_rows = 1

for run_number in tritium_run:
    run_file = ROOT.TFile.Open(str(run_number)+'.root')
    tritium_values = all_hist_file(run_file, label=0)
    # The bin contents are already copied into numpy, so release the file.
    run_file.Close()
    # Drop empty histograms. Only the bin columns are tested: the last column
    # is the class label and must not take part in the emptiness check.
    tritium_values = tritium_values[~np.all(tritium_values[:, :-1]==0, axis=1)]
    tritium_chunks.append(tritium_values)
    tritium_rows += len(tritium_values)
    # Running shape of the accumulated data (placeholder row included).
    print((tritium_rows, tritium_values.shape[1]))

# Concatenate once at the end instead of re-copying the whole array per run.
tritium_detections = np.vstack(tritium_chunks)


In [None]:
# Row 0 is an all-zero placeholder. It is kept because the cell that follows
# the two loading loops strips the first row of each array with [1:].
alpha_chunks = [np.zeros((1, 101))]
alpha_rows = 1

for run_number in alpha_run:
    run_file = ROOT.TFile.Open(str(run_number)+'.root')
    alpha_values = all_hist_file(run_file, label=1)
    # The bin contents are already copied into numpy, so release the file.
    run_file.Close()
    # Drop empty histograms. The label column (always 1 here) is excluded from
    # the test: with it included no row is ever all-zero, so empty alpha
    # histograms would never be removed.
    alpha_values = alpha_values[~np.all(alpha_values[:, :-1]==0, axis=1)]
    alpha_chunks.append(alpha_values)
    alpha_rows += len(alpha_values)
    # Running shape of the accumulated data (placeholder row included).
    print((alpha_rows, alpha_values.shape[1]))

# Concatenate once at the end instead of re-copying the whole array per run.
alpha_detections = np.vstack(alpha_chunks)

In [None]:
# Drop the all-zero placeholder row that each array was seeded with.
# Not idempotent: running this cell a second time discards a real data row.
tritium_detections = tritium_detections[1:]

alpha_detections = alpha_detections[1:]

In [None]:
import csv
os.chdir('../Notebooks')

# Create (or overwrite) dataset.csv with the tritium rows.
# newline='' is what the csv module requires for its files: without it, on
# platforms that translate line endings, every row is followed by a spurious
# blank line.
with open("dataset.csv","w", newline='') as my_csv:
    csvWriter = csv.writer(my_csv,delimiter=',')
    csvWriter.writerows(tritium_detections)

In [None]:
import csv
os.chdir('../Notebooks')

# Append the alpha rows after the tritium rows already in dataset.csv.
# Re-running this cell appends the same rows again, so re-run the cell that
# creates the file first.
# newline='' is what the csv module requires for its files: without it, on
# platforms that translate line endings, every row is followed by a spurious
# blank line.
with open("dataset.csv","a", newline='') as my_csv:
    csvWriter = csv.writer(my_csv,delimiter=',')
    csvWriter.writerows(alpha_detections)

# Rebalancing

The dataset is heavily imbalanced towards tritium (the numbers are shown below). To counter this imbalance and reduce the risk of overfitting, we have to take measures to ensure that the minority class is not ignored.

In [81]:
import os
import pandas as pd

# Only 100 column names are supplied here: '1'..'99' plus 'label'.
# NOTE(review): the rows written to dataset.csv appear to hold 101 values
# (100 bins + label). With one name fewer than fields, pandas takes the first
# field of every row as the index, which the following cells move back into a
# column named '0' — confirm this is intended.
df = pd.read_csv("dataset.csv", header=None, names=[str(i) for i in range(1, 100)]+['label'])


In [None]:
# Turn the current index into a regular column called 'index' (it is renamed
# to '0' a few cells further down).
df_new = df.reset_index()


In [None]:

# Per-column non-null counts of the rows of each class (0 first, then 1).
for class_label in (0, 1):
    class_rows = df_new.loc[df_new['label'] == class_label]
    print(class_rows.count())

In [89]:
# Rename the column created by reset_index to '0', so that the feature columns
# are named '0'..'99' and are followed by 'label'.
df_new = df_new.rename(columns={'index':'0'})
print(df_new)

            0      1      2      3      4      5      6      7      8      9  \
0       362.0  363.0  364.0  368.0  371.0  365.0  366.0  368.0  369.0  365.0   
1       385.0  380.0  381.0  382.0  377.0  374.0  373.0  374.0  374.0  370.0   
2       340.0  343.0  345.0  344.0  336.0  335.0  337.0  340.0  341.0  342.0   
3       377.0  378.0  382.0  378.0  377.0  377.0  374.0  379.0  374.0  378.0   
4       386.0  382.0  383.0  385.0  387.0  382.0  377.0  381.0  381.0  380.0   
...       ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
137320  360.0  356.0  356.0  358.0  358.0  353.0  357.0  358.0  363.0  360.0   
137321  366.0  360.0  365.0  368.0  375.0  376.0  376.0  375.0  370.0  372.0   
137322  379.0  381.0  385.0  389.0  389.0  387.0  386.0  387.0  391.0  390.0   
137323  376.0  377.0  381.0  380.0  379.0  375.0  376.0  372.0  367.0  370.0   
137324  365.0  364.0  366.0  362.0  364.0  369.0  367.0  367.0  367.0  370.0   

        ...     91     92     93     94

#### The class imbalance is 128534 vs. 8791 (tritium / alpha); the ratio is 14.62

In [None]:
import sklearn
import imblearn 


# SMOTE over-sampler used to balance the two classes; random_state is fixed so
# that the resampling gives the same result on every run.
sm = imblearn.over_sampling.SMOTE(random_state = 42)
print(sm)

SMOTE(random_state=42)


In [91]:
# Separate the class column from the feature columns.
labels = df_new['label']
data = df_new.drop(columns='label')


In [93]:
# Over-sample with SMOTE; afterwards both classes have the same number of rows.
# NOTE(review): the whole dataset is resampled here. If a test set is later
# split off from the balanced data, synthetic points derived from test rows can
# end up in training — confirm how the evaluation split is made.
X, y = sm.fit_resample(data, labels)

In [None]:
# Put the resampled features and their labels back into a single frame, with
# the labels as the last column.
balanced_df = pd.concat([X, y], axis=1)

In [97]:
# Plain-text dump of the balanced frame. The rows at the end carry non-integer
# values, i.e. they are samples synthesised by the over-sampler.
print(balanced_df)

                 0           1           2           3           4  \
0       362.000000  363.000000  364.000000  368.000000  371.000000   
1       385.000000  380.000000  381.000000  382.000000  377.000000   
2       340.000000  343.000000  345.000000  344.000000  336.000000   
3       377.000000  378.000000  382.000000  378.000000  377.000000   
4       386.000000  382.000000  383.000000  385.000000  387.000000   
...            ...         ...         ...         ...         ...   
257063  374.389756  374.702572  374.015388  371.328204  369.030776   
257064  381.123381  377.928576  371.000000  368.487011  364.168826   
257065  383.062439  384.062439  387.438045  387.562924  385.875121   
257066  372.000000  371.463960  368.418274  364.627412  366.209137   
257067  367.480661  368.844198  366.051934  369.051934  373.688397   

                 5           6           7           8           9  ...  \
0       365.000000  366.000000  368.000000  369.000000  365.000000  ...   
1       3

In [107]:
# Per-column non-null counts of the rows of each class (0 first, then 1).
for class_label in (0, 1):
    class_rows = balanced_df.loc[balanced_df['label'] == class_label]
    print(class_rows.count())

0        128534
1        128534
2        128534
3        128534
4        128534
          ...  
96       128534
97       128534
98       128534
99       128534
label    128534
Length: 101, dtype: int64
0        128534
1        128534
2        128534
3        128534
4        128534
          ...  
96       128534
97       128534
98       128534
99       128534
label    128534
Length: 101, dtype: int64


In [109]:
import csv
os.chdir('../Notebooks')

# Give to_csv the path and let pandas open and close the file itself. The
# previous version also opened the same file with open(..., "w") and never used
# that handle, so the file was open for writing twice at the same time.
# The row index is still written as the first (unnamed) column, as before.
balanced_df.to_csv('balanced_dataset.csv')

#### Classes have been rebalanced; we will run tests on both the original (imbalanced) and the balanced dataset