# Preliminaries
Data preprocessing in order to feed our CNN. The following considerations are taken into account:

* To read the data, Uproot is faster than Pyroot. However, we do not know if the software can be installed properly at CUDA and CIEMAT computers. 
* Software to reduce the size of the images while keeping the signal and its important characteristics.
* A method to store this data efficiently. Probably png or csv files.

# READ DATA

In [None]:
########## dependencies ###########
import uproot
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Open the input ROOT file and print the keys found under its first top-level key.
file = uproot.open("0-RecoFull-Parser.root")
first_key = file.keys()[0]
print(
    "the data contains in the file has the following format: ",
    file[first_key].keys(),
)
print("We can access the data using")

In [None]:
# Descend from "analysistree" to "anatree"; the final expression shows the
# tree's keys as the cell output.
analysis_dir = file["analysistree"]
tree = analysis_dir["anatree"]
tree.keys()

In [None]:
# Inspect entry 98 of two waveform branches.
channel_branch = tree.array(b'RecoWaveform_Channel')
print(channel_branch[98].shape)  # author's note: 1229 channels
nticks_branch = tree.array(b'RecoWaveform_NTicks')
print(nticks_branch[98])  # author's note: 1667 ticks per channel; the explanatory pdf is wrong

* It seems that there are 1229 channels and 1667 ticks. 
* probably first view from 0 to 279 and second the rest. 

In [None]:
# Read the branches used by the rest of the notebook.
ADC = tree.array(b'RecoWaveform_ADC')
NChannel = tree.array(b'RecoWaveforms_NumberOfChannels')
Nticks = tree.array(b'RecoWaveform_NumberOfTicksInAllChannels')
NTracks = tree.array(b'NumberOfTracks')

# Image dimensions taken from the first entry:
# w = number of channels, h = total ticks divided by number of channels.
w = int(NChannel[0])
h = int(Nticks[0] / NChannel[0])
print(w, h)

In [None]:
# Reshape entry 56 into a (w, h) image, keep its first 279 rows and plot them.
im = ADC[56].reshape(w, h)
v1 = im[:279]
fig = plt.figure(frameon=False)
plt.imshow(np.transpose(v1), cmap='jet', interpolation='none')
fig.set_size_inches(30, 40)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('big.png')

In [None]:
def maxpool(im, h, w):
    """Downsample a 2-D image by taking the maximum of non-overlapping tiles.

    Axis 0 of ``im`` is split into ``h`` tiles of ``im.shape[0] // h`` rows
    and axis 1 into ``w`` tiles of ``im.shape[1] // w`` columns. Rows and
    columns left over after the last full tile are discarded, so the image
    is cropped whenever its sides are not exact multiples of ``h`` and ``w``.

    Args:
        im: 2-D array (or array-like) to reduce.
        h: number of rows of the output (tiles along axis 0 of ``im``).
        w: number of columns of the output (tiles along axis 1 of ``im``).

    Returns:
        A float64 array of shape ``(h, w)`` holding the maximum of each tile.

    Raises:
        ValueError: if ``h`` or ``w`` is negative or larger than the
            corresponding side of ``im`` (a tile would be empty).
    """
    im = np.asarray(im)
    h_step = im.shape[0] // h
    w_step = im.shape[1] // w
    if h_step < 1 or w_step < 1:
        raise ValueError(
            "cannot pool an image of shape %s down to (%d, %d): "
            "every tile must contain at least one pixel" % (im.shape, h, w)
        )

    # Crop to the area covered by full tiles, expose each tile on its own
    # pair of axes, and reduce over those axes in a single NumPy call.
    cropped = im[:h * h_step, :w * w_step]
    tiles = cropped.reshape(h, h_step, w, w_step)
    # The original accumulated into np.zeros, so the result stays float64.
    return tiles.max(axis=(1, 3)).astype(np.float64)

In [None]:
def maxpoolmod(im, h, w):
    """Max-pool ``im`` to ``(h, w)`` while randomly skipping spare rows.

    Plain tiling uses only the first ``h * (im.shape[0] // h)`` rows of
    ``im``. This variant spreads the sampled rows over the whole image: after
    each output cell is filled, a uniform random number decides whether the
    row offset of all following tiles is pushed down by one input row. The
    offset never exceeds the number of spare rows, so every tile stays inside
    the image.

    The fraction of rows that plain tiling would keep is printed once.

    Args:
        im: 2-D array (or array-like) to reduce.
        h: number of rows of the output (tiles along axis 0 of ``im``).
        w: number of columns of the output (tiles along axis 1 of ``im``).

    Returns:
        A tuple ``(reduced_im, count)``: the float64 array of shape ``(h, w)``
        and the number of input rows that were skipped.

    Raises:
        ValueError: if ``h`` or ``w`` is negative or larger than the
            corresponding side of ``im`` (a tile would be empty).
    """
    im = np.asarray(im)
    h_step = im.shape[0] // h
    w_step = im.shape[1] // w
    if h_step < 1 or w_step < 1:
        raise ValueError(
            "cannot pool an image of shape %s down to (%d, %d): "
            "every tile must contain at least one pixel" % (im.shape, h, w)
        )

    reduced_im = np.zeros((h, w))
    # Rows of *this* image (not a notebook global) left over by plain tiling.
    extra_pixels = im.shape[0] - h * h_step
    kept_fraction = (h * h_step) / im.shape[0]
    print(kept_fraction)

    # Per-cell chance of skipping a row; on average about `extra_pixels`
    # skips happen over the h * w cells.
    skip_probability = extra_pixels / h / w
    count = 0
    for i in range(h):
        for j in range(w):
            r = np.random.uniform()
            # The offset is counted in input rows, so the lowest tile ends at
            # most at h * h_step + extra_pixels == im.shape[0].
            top = i * h_step + count
            pool = im[top:top + h_step, j * w_step:(j + 1) * w_step]
            # Every cell is filled, including after all spare rows are used.
            reduced_im[i, j] = np.max(pool)
            # NOTE(review): the offset can advance in the middle of an output
            # row, as in the original; confirm whether a per-row draw is wanted.
            if r < skip_probability and count < extra_pixels:
                count += 1
    return reduced_im, count

In [None]:
# Remainder of v1's row count modulo 200 (shown as the cell output).
v1.shape[0] % 200

In [None]:
# Plain max-pooling of v1 down to 200x200, then plot and save the result.
v1red = maxpool(v1, 200, 200)
fig = plt.figure(frameon=False)
plt.imshow(np.transpose(v1red), cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')

# Same with the randomised variant; this second savefig overwrites 'small.png'.
v1redmu, count = maxpoolmod(v1, 200, 200)
print(count)
fig = plt.figure(frameon=False)
plt.imshow(np.transpose(v1redmu), cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')

# Observaciones:
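* The algorithm only works approximately when h and w are much smaller than the corresponding dimensions of im.shape. Otherwise, the algorithm crops the image: the rows and columns left over after the last full pooling window are discarded. 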

* In order to account for this issue, what can be done is 

    1. reduce only the big axis, the one with
    2. modify the algorithm 

v1red = maxpool(v1, 200, 200)  # plain max-pooling to 200x200
fig = plt.figure(frameon=False)
plt.imshow(v1red.T, cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')

v1red = maxpoolmod(v1, 200, 200)[0]  # maxpoolmod returns (image, count); keep only the image so .T works
fig = plt.figure(frameon=False)
plt.imshow(v1red.T, cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')  # overwrites the file written a few lines earlier

# Considerations: file size
In order to save memory, our data set must be made of these images. It can be stored in a csv file or in a ROOT tree. We will see how efficient the ROOT tree is. Options: 

* use csv files with delimiter ',' : 200x200 img is ~900K
* use numpy.save : 200x200 img is ~300K
* use ROOT trees: 200x200 img is ~33K
* use png images: The data is not properly conserved; we do not keep the numerical values but rather a png image that approximates these values. 

As far as disk space is concerned, probably the best option to tackle this problem is using ROOT trees.

In [None]:
# Write the reduced image twice: in NumPy's binary format and as
# comma-separated text.
np.save(file='savenp', arr=v1red)
np.savetxt(fname="foo.csv", X=v1red, delimiter=",")

In [None]:
# Write v1red as a single entry of a TTree stored in example.root (PyROOT).
from ROOT import TFile, TTree
import numpy as np

# The file is opened in 'recreate' mode before the tree is constructed.
# NOTE(review): presumably the tree attaches to the currently open file, so
# the order of these two statements matters — confirm against ROOT docs.
f = TFile('example.root', 'recreate')
t = TTree('mytree', 'example tree')

# Branch 'myarray' is declared with the leaf list 'myarray[200][200]/D' and is
# given v1red as its buffer.
# NOTE(review): assumes v1red is a contiguous 200x200 float64 array — confirm.
t.Branch('myarray', v1red, 'myarray[200][200]/D')
print(v1red.flatten().shape)

# One Fill() call, i.e. one entry.
t.Fill()

f.Write()
f.Close()

In [None]:
# Read example.root back with PyROOT and print the stored array.
import ROOT
import numpy as np

tfile = ROOT.TFile("example.root")
# The tree is fetched by attribute access using its name, 'mytree'.
ttree = tfile.mytree

# Only the first entry is read (nentries is hard-coded to 1).
nentries = 1
for i in range(nentries):
    # GetEntry(i) is called before the branch is read via attribute access.
    ttree.GetEntry(i)
    print(ttree.myarray)


In [None]:
# First element of the reduced image in flattened order (shown as the cell output).
v1red.ravel()[0]

In [None]:
# Flatten the reduced image, reshape it back to 200x200, and plot the result.
fig = plt.figure(frameon=False)
flat_values = v1red.flatten()
restored = flat_values.reshape(200, 200)
plt.imshow(np.transpose(restored), cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')

In [None]:
# Re-open the file written with PyROOT, this time through uproot, and fetch
# the tree by its cycle-qualified key. This rebinds `file` and `tree`.
file = uproot.open("example.root")
tree = file[b'mytree;1']

In [None]:
# Read branch 'myarray' and keep its first entry as `im`.
stored_entries = tree.array(b'myarray')
im = stored_entries[0]

In [None]:
# Print the shape of the array read back and of the array that was written,
# one per line.
print(im.shape, v1red.shape, sep="\n")

In [None]:
# Plot the image read back from example.root.
fig = plt.figure(frameon=False)
plt.imshow(np.transpose(im), cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')

This reduction algorithm must be uploaded to lxplus and run there, and the reduced data then downloaded. Consequently, 1000 $e^{-}$ and 1000 $\mu^{-}$ waveforms should take around 60 MB, and 1 million waveforms around 30 GB. I believe 30 GB is a very reasonable number. 

We have to see if the image reduction algorithm also works for electrons. 

In [None]:
# Open the electron sample and rebind `file` and `tree` to it.
# NOTE(review): the path is absolute and machine-specific.
electron_path = "/Users/dan/CIEMAT/ROOT/ROOT_try/electrons/0-RecoFull-Parser.root"
file = uproot.open(electron_path)
analysis_dir = file["analysistree"]
tree = analysis_dir["anatree"]

In [None]:
# Read the same branches as before, now from the tree currently bound to `tree`.
ADC = tree.array(b'RecoWaveform_ADC')
NChannel = tree.array(b'RecoWaveforms_NumberOfChannels')
Nticks = tree.array(b'RecoWaveform_NumberOfTicksInAllChannels')
NTracks = tree.array(b'NumberOfTracks')

# Image dimensions taken from the first entry:
# w = number of channels, h = total ticks divided by number of channels.
w = int(NChannel[0])
h = int(Nticks[0] / NChannel[0])
print(w, h)

In [None]:
# Reshape entry 61 into a (w, h) image, keep its first 279 rows and plot them.
im = ADC[61].reshape(w, h)
v1 = im[:279]
fig = plt.figure(frameon=False)
plt.imshow(np.transpose(v1), cmap='jet', interpolation='none')
fig.set_size_inches(30, 40)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('big.png')

In [None]:
# Plain max-pooling of v1 down to 100x100, then plot and save the result.
v1red = maxpool(v1, 100, 100)
fig = plt.figure(frameon=False)
plt.imshow(np.transpose(v1red), cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')

# Same with the randomised variant; this second savefig overwrites 'small.png'.
v1rede, count = maxpoolmod(v1, 100, 100)
print(count)
fig = plt.figure(frameon=False)
plt.imshow(np.transpose(v1rede), cmap='jet', interpolation='none')
fig.set_size_inches(10, 10)  # original note said "grey scale", but the colormap is 'jet'
plt.savefig('small.png')

In [None]:
# Number of cells in a 100-by-100 grid (shown as the cell output).
100 ** 2

# Conclusions

Now our data is ready to feed our Deep Neural networks.

Conviene reducir a 100x100 pixeles

* bloque convolutivo
 1. kernel(5...7...9)
 2. bajar después+pooling (2x2)
* salida
 1. confusion matrix
 2. evolución del fitness en función de la época
 


