Read the raw output of the classification step and convert it to an easier-to-use format

In [1]:
import os
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read datasets and extract labels

In [2]:
# Directory that holds the classified CSV files; the trailing slash is kept
# because other cells build paths by plain string concatenation with f_dir.
f_dir = './data_processed/'

# Full paths of the two classified input files.
fp_total = f'{f_dir}Classified total Dataset.csv'
fp_acceptance = f'{f_dir}Classified_acceptance_mc.csv'

In [3]:
def extract_classification(fp):
    """Reduce a classified CSV to its signal rows and save the result as a pickle.

    Reads the CSV at ``fp``, keeps the three mass columns, ``q2``, the three
    angular columns and ``Classification`` (cast to a categorical dtype),
    prints the row count before and after keeping only the rows whose
    ``Classification`` equals 0, and pickles the filtered frame next to the
    input file with the file extension replaced by ``.bz2``.

    Parameters
    ----------
    fp : str
        Path to the classified CSV file.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. The row index of the input is preserved (it is
        not reset after filtering).
    """
    # masses, q2 and the angular variables, plus the label used for filtering
    keep_cols = ['B0_MM', 'Kstar_MM', 'J_psi_MM', 'q2',
                 'phi', 'costhetal', 'costhetak',
                 'Classification']

    df = pd.read_csv(fp)[keep_cols]
    df = df.astype({'Classification': 'category'})

    print(f'before: {len(df.index)}')
    df = df[df['Classification'] == 0] # keep only signal values
    print(f'after: {len(df.index)}')

    # Swap the extension with splitext instead of slicing off a fixed four
    # characters, so inputs whose suffix is not exactly '.csv' still map to a
    # sensible '<stem>.bz2'. to_pickle infers bz2 compression from the suffix.
    save_fp = os.path.splitext(fp)[0] + '.bz2'
    df.to_pickle(save_fp)

    return df

In [4]:
# Filter the total dataset, then show summary statistics as the cell output.
df_total = extract_classification(fp=fp_total)
df_total.describe()

before: 498245
after: 41300


Unnamed: 0,B0_MM,Kstar_MM,J_psi_MM,q2,phi,costhetal,costhetak
count,41300.0,41300.0,41300.0,41300.0,41300.0,41300.0,41300.0
mean,5308.926799,940.387595,3144.616153,10.095175,0.006103,-0.0032,-0.106982
std,100.416702,98.039554,454.377676,2.590921,1.811489,0.520525,0.57976
min,5128.011574,740.036,218.800325,0.047873,-3.141589,-0.99998,-0.999862
25%,5254.341746,876.540205,3086.767976,9.528269,-1.552704,-0.423175,-0.622256
50%,5283.436507,919.62186,3099.743528,9.608578,-0.003885,-0.006271,-0.162901
75%,5330.029746,1015.715549,3117.1415,9.716619,1.574583,0.417049,0.38549
max,5699.935632,1149.974,5009.982,25.100191,3.141488,0.999736,0.999992


In [5]:
# Filter the acceptance sample, then show summary statistics as the cell output.
df_acceptance = extract_classification(fp=fp_acceptance)
df_acceptance.describe()

before: 716859
after: 635239


Unnamed: 0,B0_MM,Kstar_MM,J_psi_MM,q2,phi,costhetal,costhetak
count,635239.0,635239.0,635239.0,635239.0,635239.0,635239.0,635239.0
mean,5277.757494,908.816391,2998.165507,9.999623,-0.003468,0.007866,-0.057183
std,25.281558,54.94968,1005.238518,5.486524,1.820435,0.554846,0.542337
min,5090.396742,740.079435,212.901315,0.045329,-3.141588,-0.99998,-0.999981
25%,5268.210295,880.395653,2319.478552,5.38006,-1.577938,-0.455549,-0.516783
50%,5280.235965,899.744547,3227.01659,10.413646,-0.006833,0.009835,-0.072489
75%,5291.417953,925.211476,3869.70105,14.974891,1.571142,0.475474,0.389796
max,5723.484919,1149.996548,4598.185229,21.138934,3.141592,1.0,0.999994


## Reconstruct classified dataset 

In [6]:
# Reload every saved .bz2 pickle in f_dir (in sorted path order) and print its
# path, its summary statistics and a blank separator line.
for fp in sorted(glob.glob(f'{f_dir}*.bz2')):
    df = pd.read_pickle(fp)
    print(fp, df.describe(), '', sep='\n')

./data_processed/Classified total Dataset.bz2
              B0_MM      Kstar_MM      J_psi_MM            q2           phi  \
count  41300.000000  41300.000000  41300.000000  41300.000000  41300.000000   
mean    5308.926799    940.387595   3144.616153     10.095175      0.006103   
std      100.416702     98.039554    454.377676      2.590921      1.811489   
min     5128.011574    740.036000    218.800325      0.047873     -3.141589   
25%     5254.341746    876.540205   3086.767976      9.528269     -1.552704   
50%     5283.436507    919.621860   3099.743528      9.608578     -0.003885   
75%     5330.029746   1015.715549   3117.141500      9.716619      1.574583   
max     5699.935632   1149.974000   5009.982000     25.100191      3.141488   

          costhetal     costhetak  
count  41300.000000  41300.000000  
mean      -0.003200     -0.106982  
std        0.520525      0.579760  
min       -0.999980     -0.999862  
25%       -0.423175     -0.622256  
50%       -0.006271     -0