# CSV file processing

In this notebook I'll test the macro, written in python, used for converting a ROOT structure (Tree) into a CSV file for ML analysis.

The CSV is located inside the directory output, created by the python macro **examplemacro.py**:

In [None]:
# Shell escape: list the contents of the analysis template directory.
!ls ../MuonPOGAnalysisTemplate/

In [None]:
# Change the notebook's working directory to the macro's output folder;
# the relative paths used by the following cells are resolved from here.
%cd ../MuonPOGAnalysisTemplate/output
# Show the first five lines of the CSV.
!head -5 output_muons.csv

To analyze it, we need pandas for the CSV handling, together with numpy, matplotlib and progressbar (the ROOT Python module is not imported in this notebook):

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import progressbar

Now we need to read the CSV:

In [None]:
# Load the CSV into a DataFrame. The path is relative to the current working
# directory, so the notebook must already be inside the output folder.
df2 = pd.read_csv('./output_muons.csv')
# Bare expression: the notebook renders the DataFrame as the cell output.
df2

We sort the dataframe by `Event` and, within each event, by ascending `dtPrimitive.id_r`, then renumber the index:

In [None]:
# Order the rows by event and, within an event, by dtPrimitive.id_r, then
# replace the index with a fresh 0..n-1 range (the old index is discarded).
df2 = df2.sort_values(["Event","dtPrimitive.id_r"]).reset_index(drop=True)
# Bare expression: the notebook renders the sorted DataFrame as the cell output.
df2
#df2["dtPrimitive.phiB"] = df2["dtPrimitive.phiGlb()"] + df2["dtPrimitive.phiB"]/512.

### Change CSV structure for a suitable ML format

Using the following code, the structure of the input CSV changes. Instead of considering each line as a single primitive, now each line represents a single event (with multiple primitives structured by columns).

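The main difference, therefore, is a larger number of columns: every original column except the first is replicated four times (prefixes `1` to `4`), one copy per primitive, and an `n_Primitive` column stores how many primitives the event actually has. Slots without a primitive are filled with NaN.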

In [None]:
# Reshape df2 (one row per primitive) into df (one row per event).
#
# Output layout: the first column of df2, then "n_Primitive" (number of
# primitives kept for the event), then every other df2 column replicated
# MAX_PRIMITIVES times with a "1".."4" prefix. Slots without a primitive are NaN.
#
# An event is kept only if its rows have strictly increasing dtPrimitive.id_r
# and it has at most MAX_PRIMITIVES primitives; otherwise the whole event is
# dropped. df2 is expected to be sorted by Event so that the rows of an event
# are contiguous.
MAX_PRIMITIVES = 4


def _flatten_event(primitives, n_columns, max_primitives=MAX_PRIMITIVES):
    """Flatten the primitive rows of one event into a single wide row.

    Input:
        primitives: non-empty list of rows (each a list of df2 values)
        n_columns: number of columns of the per-primitive table
        max_primitives: number of primitive slots reserved per column

    Returns:
        list: [first-column value, number of primitives, then for every
        remaining column the value of each primitive followed by NaN padding
        up to max_primitives slots]
    """
    wide_row = [primitives[0][0], len(primitives)]
    padding = [np.nan] * (max_primitives - len(primitives))
    for i in range(1, n_columns):
        wide_row.extend(primitive[i] for primitive in primitives)
        wide_row.extend(padding)
    return wide_row


# Column names of the wide table; "()" is stripped from accessor-style names.
source_columns = df2.columns.values.tolist()
new_column = [source_columns[0], "n_Primitive"]
for column in source_columns[1:]:
    column = column.replace("()", "")
    for count in range(1, MAX_PRIMITIVES + 1):
        new_column.append(str(count) + column)

bar = progressbar.ProgressBar(maxval=len(df2.index), widgets = [progressbar.Bar('=','[',']'), '', progressbar.Percentage()])
bar.start()

wide_rows = []        # one flattened row per kept event
wide_index = []       # event number used as index label of each kept row
row_list = []         # primitives collected so far for the current event
current_event = None  # None until the first row has been read
last_id_r = 0         # id_r of the last primitive accepted for the event

# Progress is tracked by row position rather than by index label, so it stays
# within maxval even if df2's index is not 0..n-1.
for position, (_, row) in enumerate(df2.iterrows(), start=1):
    bar.update(position)
    event = row["Event"]
    id_r = row["dtPrimitive.id_r"]
    if event != current_event:
        # A new event starts: store the previous one if it survived the cuts.
        if 0 < len(row_list) <= MAX_PRIMITIVES:
            wide_rows.append(_flatten_event(row_list, len(source_columns)))
            wide_index.append(current_event)
        current_event = event
        row_list = [row.tolist()]
        # Remember the first primitive's id_r so that a duplicate of it is
        # detected like any other duplicate.
        last_id_r = id_r
    elif id_r > last_id_r:
        row_list.append(row.tolist())
        last_id_r = id_r
    else:
        # id_r did not increase: drop the whole event and make sure no
        # later row of the same event is accepted.
        row_list = []
        last_id_r = np.inf

# Store the last event, which no following row has flushed.
if 0 < len(row_list) <= MAX_PRIMITIVES:
    wide_rows.append(_flatten_event(row_list, len(source_columns)))
    wide_index.append(current_event)
bar.finish()

# Build the frame once instead of growing it row by row: appending with
# df.loc copies the frame on every insertion and leaves the columns untyped.
df = pd.DataFrame(wide_rows, columns=new_column, index=wide_index)

Next, it is important to remove the columns that aren't essential, such as the duplicated genParticle and l1Muon columns (only the copy with prefix `1` is kept).

In [None]:
# Drop the "1" prefix from the genParticle columns of the first primitive;
# these are the copies that are kept once the duplicates are removed.
gen_fields = ("pt", "phi", "eta", "pdgId", "status")
df = df.rename(columns={"1genParticle." + field: "genParticle." + field for field in gen_fields})

In [None]:
# Remove every column whose name starts with the character '5'.
prefix5_columns = [name for name in df.columns.values.tolist() if name[0] == '5']
df = df.drop(prefix5_columns, axis=1)

In [None]:
# Keep only the first copy of l1Muon.pt: drop the copies prefixed 2, 3 and 4.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
df = df.drop([str(i) + "l1Muon.pt" for i in range(2, 5)], axis=1)

In [None]:
# Keep only the first copy of genParticle.pt: drop the copies prefixed 2, 3 and 4.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
df = df.drop([str(i) + "genParticle.pt" for i in range(2, 5)], axis=1)

In [None]:
# Keep only the first copy of genParticle.eta: drop the copies prefixed 2, 3 and 4.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
df = df.drop([str(i) + "genParticle.eta" for i in range(2, 5)], axis=1)

In [None]:
# Keep only the first copy of genParticle.phi: drop the copies prefixed 2, 3 and 4.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
df = df.drop([str(i) + "genParticle.phi" for i in range(2, 5)], axis=1)

In [None]:
# Keep only the first copy of genParticle.pdgId: drop the copies prefixed 2, 3 and 4.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
df = df.drop([str(i) + "genParticle.pdgId" for i in range(2, 5)], axis=1)

In [None]:
# Keep only the first copy of genParticle.status: drop the copies prefixed 2, 3 and 4.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
df = df.drop([str(i) + "genParticle.status" for i in range(2, 5)], axis=1)

Insert columns holding the absolute difference in `phiGlb` between each pair of the four primitives.

In [None]:
# Add one column per pair of primitives holding |phiGlb_i - phiGlb_j|.
# Every insert targets the same position (just before the column that was
# last when this cell started), so each new column lands in front of the ones
# inserted earlier; the pairs are listed in reverse so that the final order is
# delta_phi12, 13, 14, 23, 24, 34.
cols = df.columns.values.tolist()
insert_at = len(cols) - 1
for first, second in [(3, 4), (2, 4), (2, 3), (1, 4), (1, 3), (1, 2)]:
    phi_first = df["%ddtPrimitive.phiGlb" % first]
    phi_second = df["%ddtPrimitive.phiGlb" % second]
    df.insert(insert_at, "delta_phi%d%d" % (first, second), abs(phi_first - phi_second))

In [None]:
# Remove every column whose name contains "phiGlb".
phiglb_columns = [name for name in df.columns.values.tolist() if "phiGlb" in name]
df = df.drop(phiglb_columns, axis=1)

Let's plot the correlation matrix of the variables in the reshaped table:

In [None]:
def plot_corr(df,size=10):
    '''Draw the correlation matrix of a DataFrame's columns as an image.

    The matrix returned by df.corr() is drawn on a new square figure, with the
    column names used as tick labels on both axes (x labels rotated by 90
    degrees). The figure is not shown; the caller decides when to display it.

    Input:
        df: pandas DataFrame
        size: width and height of the figure, passed to figsize'''

    corr = df.corr()
    labels = corr.columns
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)
    # Drawn a second time through pyplot, on top of the matshow image.
    plt.imshow(corr, aspect=1)

In [None]:
# Draw the correlation matrix of the reshaped table, add a colour scale
# and display the figure.
plot_corr(df)
# NOTE(review): no mappable is passed, so this presumably picks up the image
# drawn last by plot_corr via plt.imshow - confirm.
plt.colorbar()
plt.show()

Then put the organized table inside a csv file.

In [None]:
# Bare expression: the notebook renders the final table as the cell output.
df

In [None]:
# Write the reshaped table to disk without the index column.
# na_rep=0 writes every missing value as 0, so an empty primitive slot cannot
# be told apart from a real 0 in the output file.
# NOTE(review): the destination is an absolute, machine-specific path - adjust
# it before running this notebook elsewhere.
df.to_csv("/Users/tommaso/TESI_MAGISTRALE/MuonPOGAnalysisTemplate/output/bxcut_full_3.csv",na_rep=0,index=False)