In [88]:
from pathlib import Path

import numpy as np
import pandas as pd
from mrmr import mrmr_classif

In [89]:
bkg = pd.read_csv('background_train.csv')
sig = pd.read_csv('signal_train.csv')

# Remove the columns that exist in only one of the two datasets.
# The symmetric difference (^) of the two header sets is exactly the set of
# columns that are exclusive to one file.
colsToRemove = set(sig.columns) ^ set(bkg.columns)

# By construction every entry of colsToRemove is missing from one of the two
# frames. DataFrame.drop raises KeyError for unknown labels by default, so
# errors='ignore' is needed for the drop to succeed whenever the headers
# actually differ.
bkg.drop(columns=list(colsToRemove), errors='ignore', inplace=True)
sig.drop(columns=list(colsToRemove), errors='ignore', inplace=True)

# Display the remaining columns of both frames.
bkg.columns, sig.columns

(Index(['CorsikaWeightMap.AreaSum', 'CorsikaWeightMap.Atmosphere',
        'CorsikaWeightMap.CylinderLength', 'CorsikaWeightMap.CylinderRadius',
        'CorsikaWeightMap.DiplopiaWeight', 'CorsikaWeightMap.EnergyPrimaryMax',
        'CorsikaWeightMap.EnergyPrimaryMin', 'CorsikaWeightMap.FluxSum',
        'CorsikaWeightMap.Multiplicity', 'CorsikaWeightMap.SpectralIndexChange',
        ...
        'NewAtt.DirectEllipse', 'NewAtt.DeltaZd', 'NewAtt.AbsSmooth',
        'NewAtt.emptyness', 'NewAtt.SepDevide', 'NewAtt.SPEBayVerRadius',
        'NewAtt.SplineVerRadius', 'CorsikaWeightMap.ParticleType',
        'CorsikaWeightMap.Polygonato', 'CorsikaWeightMap.PrimarySpectralIndex'],
       dtype='object', length=283),
 Index(['CorsikaWeightMap.AreaSum', 'CorsikaWeightMap.Atmosphere',
        'CorsikaWeightMap.CylinderLength', 'CorsikaWeightMap.CylinderRadius',
        'CorsikaWeightMap.DiplopiaWeight', 'CorsikaWeightMap.EnergyPrimaryMax',
        'CorsikaWeightMap.EnergyPrimaryMin', 'CorsikaWei

In [90]:
# Replace +/-inf with NaN in both frames so that infinite entries count as
# missing values. No rows are removed in this cell.
for frame in (bkg, sig):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

In [91]:
# Columns with more than 10% NaN in either dataset are dropped from both
# frames; afterwards every row that still contains a NaN is dropped.
MAX_NAN_FRACTION = .1

# isna().mean() gives the per-column fraction of missing entries.
# sig is indexed with bkg's columns so both results share the same order
# (and a column missing from sig raises KeyError, as the per-column lookup did).
bkgNaRatio = bkg.isna().mean()
sigNaRatio = sig[bkg.columns].isna().mean()

# A column is too sparse if it exceeds the threshold in at least one frame,
# which is equivalent to comparing the larger of the two ratios.
tooSparse = (bkgNaRatio > MAX_NAN_FRACTION) | (sigNaRatio > MAX_NAN_FRACTION)
sparseCols = list(tooSparse[tooSparse].index)

# Drop all flagged columns in a single call instead of one drop per column
# while iterating over the frame's own columns.
bkg.drop(columns=sparseCols, inplace=True)
sig.drop(columns=sparseCols, inplace=True)

# Remove the remaining rows that contain at least one NaN.
bkg.dropna(inplace=True)
sig.dropna(inplace=True)

In [92]:
# Remove the "MC truth" columns: every column whose name contains at least one
# of the substrings in truthList is dropped from both frames.
truthList = ['MC', 'Weight', 'Corsika', 'I3EventHeader']
truthColumns = [name for name in bkg.columns
                if any(tag in name for tag in truthList)]
bkg.drop(columns=truthColumns, inplace=True)
sig.drop(columns=truthColumns, inplace=True)

In [97]:
# Combine both datasets into a single frame with a binary class column:
# background rows get label 0, signal rows get label 1.
for frame, classId in ((bkg, 0), (sig, 1)):
    frame['label'] = classId

# NOTE: concat is called without ignore_index, so each frame keeps its own
# row labels inside df.
df = pd.concat([bkg, sig])

df['label']

0        0
1        0
2        0
3        0
4        0
        ..
17928    1
17929    1
17930    1
17931    1
17932    1
Name: label, Length: 35652, dtype: int64

In [99]:
# Feature selection with mrmr_classif; the number of features to keep (K=33)
# was chosen arbitrarily.
featureMatrix = df.drop(columns=['label'])
target = df['label']
selected_features = mrmr_classif(X=featureMatrix, y=target, K=33)

100%|███████████████████████████████████████████| 33/33 [00:02<00:00, 12.15it/s]


In [102]:
# Columns of the exported training dataset: the selected features followed by
# the 'label' column.
export_features = np.concatenate([np.ravel(selected_features), ['label']])
export_features

array(['LineFit_TTParams.lf_vel_z', 'HitStatisticsValues.max_pulse_time',
       'SplineMPEFitParams.rlogl', 'HitStatisticsValues.z_travel',
       'SplineMPEDirectHitsA.n_dir_strings', 'LineFit_TT.zenith',
       'NewAtt.DeltaZd', 'MuEXAngular4.zenith', 'NewAtt.SplineVerRadius',
       'SplineMPEDirectHitsA.n_dir_doms', 'MPEFitHighNoise.zenith',
       'MuEXAngular4_Sigma.value', 'SPEFit2_TT.zenith',
       'MPEFit_TTFitParams.rlogl', 'SplineMPE.zenith',
       'SplineMPEDirectHitsC.dir_track_length',
       'SplineMPEMuEXDifferential.zenith',
       'SplineMPETruncatedEnergy_SPICEMie_AllBINS_Muon.zenith',
       'NewAtt.radius', 'SplineMPECharacteristics.avg_dom_dist_q_tot_dom',
       'SplineMPETruncatedEnergy_SPICEMie_AllDOMS_Muon.zenith',
       'MPEFitHighNoiseFitParams.rlogl', 'MPEFit_TT.zenith',
       'MPEFitParaboloid.zenith',
       'SplineMPETruncatedEnergy_SPICEMie_AllBINS_MuEres.value',
       'MPEFitParaboloidFitParams.zenith',
       'SplineMPETruncatedEnergy_SPICEMie_A

In [107]:
# Write the selected feature columns plus 'label' to the training CSV.
# The output directory is created first: to_csv does not create missing
# directories, so a fresh checkout without build/ would otherwise fail here.
# NOTE(review): to_csv keeps its default index=True, so the row index is
# written as the first, unnamed column of the file.
outputPath = Path('build/training_data.csv')
outputPath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(outputPath, columns = export_features)