# In this notebook we will read in the original ROOT files and convert them to other formats more suitable for ML.
## We will apply a common preselection and split the data into barrel and endcap dataframes, which we will store separately.

### First standard imports

In [2]:
%matplotlib inline
import IPython


### Define location and name of the input ROOT file

In [3]:
filename = '/eos/cms/store/group/phys_egamma/ReleaseInputsArchive/2017UL_ElePhoReg/input_trees/DoubleElectron_FlatPt-1To300_2017ConditionsFlatPU0to70ECALGT_105X_mc2017_realistic_IdealEcalIC_v5-v2_AODSIM_EgRegTreeV5Refined.root'

### Imports to be able to read ROOT, and output 'parquet' and 'feather' files, which are top2 formats for ML

In [4]:
from root_pandas import read_root
from progressbar import ProgressBar
import pandas as pd
import fastparquet
import pyarrow.feather as feather


Welcome to JupyROOT 6.18/00


### Start reading and storing one million events per time. Both formats are used as outputs. One can switch to just one, but extra work is not terribly huge.

In [5]:
%time
# Grab a coffee ... and a donnut! It will take 34 mins with 4 cores and 10/16 gb of memory setup
pbar = ProgressBar()
count = 1
oneMillion=1000000
for df in pbar(read_root(filename, chunksize=oneMillion)):
    print ('count = ', count)
    feather.write_feather(df, 'df_{0}.feather'.format(count))
    df.to_parquet('df_{0}.parquet'.format(count), engine='fastparquet', compression='gzip')
    count +=1

CPU times: user 0 ns, sys: 11 µs, total: 11 µs
Wall time: 22.2 µs


N/A% (0 of 20) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

('count = ', 1)


  5% (1 of 20) |#                        | Elapsed Time: 0:01:48 ETA:   0:34:27

('count = ', 2)


 10% (2 of 20) |##                       | Elapsed Time: 0:03:36 ETA:   0:32:14

('count = ', 3)


 15% (3 of 20) |###                      | Elapsed Time: 0:05:21 ETA:   0:29:54

('count = ', 4)


 20% (4 of 20) |#####                    | Elapsed Time: 0:07:12 ETA:   0:29:28

('count = ', 5)


 25% (5 of 20) |######                   | Elapsed Time: 0:08:57 ETA:   0:26:22

('count = ', 6)


 30% (6 of 20) |#######                  | Elapsed Time: 0:10:47 ETA:   0:25:35

('count = ', 7)


 35% (7 of 20) |########                 | Elapsed Time: 0:12:36 ETA:   0:23:41

('count = ', 8)


 40% (8 of 20) |##########               | Elapsed Time: 0:14:25 ETA:   0:21:39

('count = ', 9)


 45% (9 of 20) |###########              | Elapsed Time: 0:16:11 ETA:   0:19:31

('count = ', 10)


 50% (10 of 20) |############            | Elapsed Time: 0:17:58 ETA:   0:17:44

('count = ', 11)


 55% (11 of 20) |#############           | Elapsed Time: 0:19:46 ETA:   0:16:11

('count = ', 12)


 60% (12 of 20) |##############          | Elapsed Time: 0:21:35 ETA:   0:14:35

('count = ', 13)


 65% (13 of 20) |###############         | Elapsed Time: 0:23:23 ETA:   0:12:34

('count = ', 14)


 70% (14 of 20) |################        | Elapsed Time: 0:25:12 ETA:   0:10:53

('count = ', 15)


 75% (15 of 20) |##################      | Elapsed Time: 0:26:59 ETA:   0:08:58

('count = ', 16)


 80% (16 of 20) |###################     | Elapsed Time: 0:28:51 ETA:   0:07:24

('count = ', 17)


 85% (17 of 20) |####################    | Elapsed Time: 0:30:41 ETA:   0:05:31

('count = ', 18)


 90% (18 of 20) |#####################   | Elapsed Time: 0:32:30 ETA:   0:03:38

('count = ', 19)


 95% (19 of 20) |######################  | Elapsed Time: 0:33:50 ETA:   0:01:19

('count = ', 20)


100% (20 of 20) |########################| Elapsed Time: 0:34:15 Time:  0:34:15


### Look at the columns in the dataframe. Notice, this dataframe is the LAST chunck read from ROOT file and contains a million events or less. 

In [12]:
df.dtypes

nrVert                                int32
rho                                 float32
nrPUInt                             float32
nrPUIntTrue                         float32
evt_runnr                             int32
evt_lumiSec                           int32
evt_eventnr                           int32
sc_rawEnergy                        float32
sc_rawESEnergy                      float32
sc_etaWidth                         float32
sc_phiWidth                         float32
sc_seedClusEnergy                   float32
sc_numberOfClusters                 float32
sc_numberOfSubClusters              float32
sc_clusterMaxDR                     float32
sc_clusterMaxDRDPhi                 float32
sc_clusterMaxDRDEta                 float32
sc_clusterMaxDRRawEnergy            float32
sc_corrEnergy                       float32
sc_scEta                            float32
sc_scPhi                            float32
sc_seedEta                          float32
sc_seedPhi                      

### Define all possible electron variables of interest. 


In [5]:
ele_vars = [
#=== Full electron object ===
'ele_ecalDrivenSeed',
'ele_et',
'ele_fbrem',
'ele_hademTow',
'ele_nrSatCrys',
'ele_trkEtaMode',
'ele_trkPMode',
'ele_trkPModeErr',
'ele_trkPhiMode',

#=== Possibly: for splitting into subsamples if several regressions are chained ===
'evt_eventnr',

#=== MC truth info ===
'mc_energy',

#=== Global branch, pileup ===
'rho',

#=== SuperCluster variables
'sc_dEtaSeedSC',
'sc_dPhiSeedSC',
'sc_etaWidth',
'sc_iEtaMod20',
'sc_iEtaMod5',
'sc_iEtaOrX',
'sc_iPhiMod2',
'sc_iPhiMod20',
'sc_iPhiOrY',
'sc_isEB',
'sc_numberOfClusters',
'sc_phiWidth',
'sc_rawESEnergy',
'sc_rawEnergy',
'sc_seedClusEnergy',

#=== Shower Shapes that are not full 5x5 "ssFrac" (used for selection only) ====
'ssFrac_sigmaIEtaIEta',
'ssFrac_sigmaIPhiIPhi',

#=== Shower Shapes that are full 5x5 "ssFull" ===
'ssFull_e2nd',
'ssFull_e2x5Bottom',
'ssFull_e2x5Left',
'ssFull_e2x5Max',
'ssFull_e2x5Right',
'ssFull_e2x5Top',
'ssFull_e3x3',
'ssFull_e5x5',
'ssFull_eBottom',
'ssFull_eLeft',
'ssFull_eMax',
'ssFull_eRight',
'ssFull_eTop',
'ssFull_sigmaIEtaIEta',
'ssFull_sigmaIEtaIPhi',
'ssFull_sigmaIPhiIPhi'
]



In [6]:
len(ele_vars)

45

### Sanity check of the dataframe. It is optional! Let's look at the variables and print min/max/std and other good indicators. Subselect only variables defined above.

In [14]:
ndf = df[df.columns & ele_vars]
ndf

Unnamed: 0,rho,evt_eventnr,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_seedClusEnergy,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,...,ele_et,ele_trkEtaMode,ele_trkPhiMode,ele_trkPMode,ele_trkPModeErr,ele_fbrem,ele_hademTow,ele_ecalDrivenSeed,ele_nrSatCrys,mc_energy
19000000,0.560577,7483004,100.162697,0.000000,0.010192,0.009672,100.162697,1.0,0.000000e+00,-1.110223e-16,...,101.542236,-0.198346,0.854715,108.501755,5.722579,0.101118,0.000000,1.0,0.0,103.414360
19000001,0.560577,7483004,102.297104,0.000000,0.007167,0.007494,102.297104,1.0,0.000000e+00,0.000000e+00,...,101.419838,0.198721,-2.286952,102.516655,5.853707,-0.021650,0.000000,1.0,0.0,103.414360
19000002,0.000000,7483010,815.802429,30.942381,0.014211,0.015846,814.839050,2.0,-6.217502e-05,1.870113e-04,...,237.282791,1.938216,-0.574147,849.159851,245.585541,0.452575,0.001015,1.0,0.0,849.609924
19000003,0.000000,7483010,825.615601,16.534718,0.012099,0.014092,824.827087,2.0,5.456224e-05,-2.009709e-04,...,236.893997,-1.939229,2.567113,1073.643311,359.965485,0.704514,0.000000,1.0,0.0,849.609924
19000004,2.722099,7483003,127.545967,0.000000,0.007443,0.023278,119.252518,3.0,-4.991862e-05,-5.259997e-03,...,128.327942,0.134032,-2.000583,147.272049,17.983248,0.947268,0.000000,1.0,0.0,130.602478
19000005,2.722099,7483003,129.278946,0.000000,0.008793,0.008621,129.278946,1.0,0.000000e+00,0.000000e+00,...,128.911102,-0.133630,1.140620,128.044632,7.525030,0.025384,0.000000,1.0,0.0,130.602478
19000006,3.313263,7483008,119.501480,0.000000,0.008551,0.008378,119.501480,1.0,0.000000e+00,0.000000e+00,...,120.827484,0.141874,1.226968,117.740089,7.987118,0.173061,0.000000,1.0,0.0,123.540627
19000007,3.313263,7483008,118.483543,0.000000,0.009606,0.012668,118.483543,1.0,0.000000e+00,0.000000e+00,...,119.242516,-0.141794,-1.914681,135.964798,20.876345,0.951372,0.000000,1.0,0.0,123.540627
19000008,5.424200,7483013,479.678101,25.706406,0.021173,0.027684,479.678101,1.0,0.000000e+00,0.000000e+00,...,89.590187,-2.441975,2.608145,158.790802,12.439394,-0.100572,0.002859,1.0,0.0,522.065369
19000009,5.424200,7483013,480.297455,24.934132,0.023404,0.024956,480.297455,1.0,0.000000e+00,0.000000e+00,...,89.187332,2.447175,-0.534207,28.350367,1.785440,0.059055,0.000000,1.0,0.0,522.065369


### Now it is time to combine individual 'parquet' or 'feather' files we already stored in the 'grand' dataframe and store that one to disk under the name 'comb_df'.

In [31]:
%time
list_of_dfs = []
numOfFiles = 20

# Here we are picking just one of the formats to read, but still store BOTH.
nameTemplate = 'df_{0}.parquet'
fileLocation = '/eos/user/r/rkamalie/'
print_read_df = False

for i in range(1,numOfFiles+1):
    print ('Processing {0} out of {1} files'.format(i, numOfFiles))

    fileToProcess = fileLocation + nameTemplate.format(i)
    print 'fileToProcess=', fileToProcess

    if 'feather' in nameTemplate:
        read_df = feather.read_feather(fileToProcess)
    elif 'parquet' in nameTemplate:
        read_df = pd.read_parquet(fileToProcess)
    else:
        print 'This should not happen, nameTemplate is wrong, please check it is in parquet or feather format or that the template correctly describes the existing files, \
exiting...'
        sys.exit(1)

    if print_read_df:
        print read_df.info(memory_usage='deep')
        print '-'*50
        print read_df.describe()

    # x4 reduction in size                                                                                                                                                  
    reduced_df = read_df[read_df.columns & ele_vars]

    if print_read_df:
        print reduced_df.info(memory_usage='deep')
        print '-'*50
        print reduced_df.describe()

    list_of_dfs.append(reduced_df)

print 'Start concatenating dataframes, it may take some time, about 5-6 minutes with memory usage of 3.3 GB'
comb_df = pd.concat(list_of_dfs, ignore_index=True)

print '################################################################################'
print '##################### INFO ABOUT COMBINED DATAFRAME ############################'

print comb_df.info(memory_usage='deep')
print '-'*50
print comb_df.describe()

print '################################## END #########################################'
print '################################################################################'

print 'Write this total dataframe to disk'

print 'Processing feather'
feather.write_feather(comb_df, 'comb_df.feather')

print 'Processing parquet'
comb_df.to_parquet('comb_df.parquet', engine='fastparquet', compression='gzip')

print 'All done!'

CPU times: user 21 µs, sys: 1 µs, total: 22 µs
Wall time: 36 µs
Processing 1 out of 20 files
fileToProcess= /eos/user/r/rkamalie/df_1.parquet
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 170 entries, nrVert to clus3_clusterDPhiToSeed
dtypes: float32(166), int32(4)
memory usage: 648.5 MB
None
--------------------------------------------------
               nrVert             rho         nrPUInt     nrPUIntTrue  \
count  1000000.000000  1000000.000000  1000000.000000  1000000.000000   
mean        29.260222       20.199434       35.535400       35.539474   
std         18.083026       12.836236       21.336018       20.488903   
min          1.000000        0.000000        0.000000        0.000056   
25%         15.000000        9.444513       17.000000       17.791363   
50%         28.000000       19.689667       35.000000       35.572479   
75%         42.000000       29.889420       53.000000       53.289472   
max        129.000000       7

In [32]:
comb_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414000 entries, 0 to 19413999
Data columns (total 45 columns):
rho                     float32
evt_eventnr             int32
sc_rawEnergy            float32
sc_rawESEnergy          float32
sc_etaWidth             float32
sc_phiWidth             float32
sc_seedClusEnergy       float32
sc_numberOfClusters     float32
sc_dEtaSeedSC           float32
sc_dPhiSeedSC           float32
sc_isEB                 float32
sc_iEtaOrX              float32
sc_iPhiOrY              float32
sc_iEtaMod5             float32
sc_iPhiMod2             float32
sc_iEtaMod20            float32
sc_iPhiMod20            float32
ssFull_e3x3             float32
ssFull_e5x5             float32
ssFull_eMax             float32
ssFull_e2nd             float32
ssFull_sigmaIEtaIEta    float32
ssFull_sigmaIEtaIPhi    float32
ssFull_sigmaIPhiIPhi    float32
ssFull_e2x5Max          float32
ssFull_e2x5Top          float32
ssFull_e2x5Bottom       float32
ssFull_e2x5Left        

## Here are the cuts defined by EGamma. We reimplement them for Pandas. 
Regression.1.CutBase: 
(mc.energy>0 && ssFrac.sigmaIEtaIEta>0 && ssFrac.sigmaIPhiIPhi>0 && ele.et>0 && evt.eventnr%10==0)                                               


In [43]:
mydf = comb_df[(comb_df.mc_energy>0) & (comb_df.ssFrac_sigmaIEtaIEta>0) & (comb_df.ssFrac_sigmaIPhiIPhi>0) & (comb_df.ele_et>0)]
mydf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14985168 entries, 0 to 19413999
Data columns (total 45 columns):
rho                     float32
evt_eventnr             int32
sc_rawEnergy            float32
sc_rawESEnergy          float32
sc_etaWidth             float32
sc_phiWidth             float32
sc_seedClusEnergy       float32
sc_numberOfClusters     float32
sc_dEtaSeedSC           float32
sc_dPhiSeedSC           float32
sc_isEB                 float32
sc_iEtaOrX              float32
sc_iPhiOrY              float32
sc_iEtaMod5             float32
sc_iPhiMod2             float32
sc_iEtaMod20            float32
sc_iPhiMod20            float32
ssFull_e3x3             float32
ssFull_e5x5             float32
ssFull_eMax             float32
ssFull_e2nd             float32
ssFull_sigmaIEtaIEta    float32
ssFull_sigmaIEtaIPhi    float32
ssFull_sigmaIPhiIPhi    float32
ssFull_e2x5Max          float32
ssFull_e2x5Top          float32
ssFull_e2x5Bottom       float32
ssFull_e2x5Left        

### Save now preselected 'grand' dataframe.

In [48]:
%time
print 'Processing feather'
feather.write_feather(mydf, 'preselected_comb_df.feather')

print 'Processing parquet'
mydf.to_parquet('preselected_comb_df.parquet', engine='fastparquet', compression='gzip')

print 'All done!'

Processing feather
Processing parquet
All done!


### Define usefull function to measure the RAM taking by the notebook.

In [122]:
#memory usage and release 
#https://stackoverflow.com/questions/39100971/how-do-i-release-memory-used-by-a-pandas-dataframe

import os, psutil, numpy as np
import gc
    
def usage():
    process = psutil.Process(os.getpid())
    return process.memory_info()[0] / float(2 ** 20)

def garbageCollect():
    gc.collect()

In [5]:
from notebook.services.config import ConfigManager
ConfigManager().update('notebook', {'ExecuteTime': {
    'display_absolute_timestamps': False,
    'relative_timing_update_period': 5,
    'template': {
        'executed': 'started ${start_time}, finished in ${duration}',
    }
}})


{u'ExecuteTime': {u'display_absolute_timestamps': False,
  u'relative_timing_update_period': 5,
  u'template': {u'executed': 'started ${start_time}, finished in ${duration}'}},
 u'load_extensions': {u'jupyter-js-widgets/extension': True,
  u'rise/main': True}}

### Some usefull commands for notebooks
https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts


### If anywhere by chance the system dies but preselected dataframe is stored, here is how to read it back for 'parquet':

In [29]:
import pandas as pd
#%timeit 
mydf = pd.read_parquet('preselected_comb_df.parquet')
# takes about 20 seconds


### And here is how to read it for 'feather' format.

In [123]:
import pandas as pd
import pyarrow.feather as feather
#%timeit 
mydf = feather.read_feather('preselected_comb_df.feather')
#takes 5 to 55 seconds
### PREFER feather SINCE IT REINDEXES

In [124]:
usage()

3551.95703125

### If the namespace is full (output of usage() is close to the notebook limit (10-16 Gb depending on setting you chose)), you may want to check what exactly occupies the napespace. For that, use 'whos' command. Then, you can delete intidivual items using 'del nameOfObject' command.

In [48]:
#del mydf

0

In [7]:
usage()

2793.13671875

In [6]:
mydf.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14985168 entries, 0 to 14985167
Data columns (total 45 columns):
rho                     float32
evt_eventnr             int32
sc_rawEnergy            float32
sc_rawESEnergy          float32
sc_etaWidth             float32
sc_phiWidth             float32
sc_seedClusEnergy       float32
sc_numberOfClusters     float32
sc_dEtaSeedSC           float32
sc_dPhiSeedSC           float32
sc_isEB                 float32
sc_iEtaOrX              float32
sc_iPhiOrY              float32
sc_iEtaMod5             float32
sc_iPhiMod2             float32
sc_iEtaMod20            float32
sc_iPhiMod20            float32
ssFull_e3x3             float32
ssFull_e5x5             float32
ssFull_eMax             float32
ssFull_e2nd             float32
ssFull_sigmaIEtaIEta    float32
ssFull_sigmaIEtaIPhi    float32
ssFull_sigmaIPhiIPhi    float32
ssFull_e2x5Max          float32
ssFull_e2x5Top          float32
ssFull_e2x5Bottom       float32
ssFull_e2x5Left        

### Settings to force the dataframe display all columns and not the truncted version.

In [125]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
mydf

Unnamed: 0,rho,evt_eventnr,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_seedClusEnergy,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,sc_isEB,sc_iEtaOrX,sc_iPhiOrY,sc_iEtaMod5,sc_iPhiMod2,sc_iEtaMod20,sc_iPhiMod20,ssFull_e3x3,ssFull_e5x5,ssFull_eMax,ssFull_e2nd,ssFull_sigmaIEtaIEta,ssFull_sigmaIEtaIPhi,ssFull_sigmaIPhiIPhi,ssFull_e2x5Max,ssFull_e2x5Top,ssFull_e2x5Bottom,ssFull_e2x5Left,ssFull_e2x5Right,ssFull_eTop,ssFull_eBottom,ssFull_eLeft,ssFull_eRight,ssFrac_sigmaIEtaIEta,ssFrac_sigmaIPhiIPhi,ele_et,ele_trkEtaMode,ele_trkPhiMode,ele_trkPMode,ele_trkPModeErr,ele_fbrem,ele_hademTow,ele_ecalDrivenSeed,ele_nrSatCrys,mc_energy
0,10.558399,9136010,96.798744,0.000000,0.006568,0.031675,70.552330,4.0,-5.031547e-04,-2.415152e-03,1.0,32.0,256.0,1.0,1.0,6.0,15.0,70.489532,84.388367,48.542126,9.435896,0.007908,0.065228,0.017998,79.950562,22.733250,7.897033,3.577513,8.699830,9.435896,3.456726,1.023364,3.785786,0.009104,0.013606,86.204430,0.523132,-2.002805,78.652199,7.454437,0.867825,0.000000,1.0,0.0,99.399963
1,10.558399,9136010,97.621788,0.000000,0.009281,0.010019,97.621788,1.0,0.000000e+00,0.000000e+00,1.0,-28.0,75.0,-2.0,0.0,-2.0,14.0,93.421600,98.287766,31.366396,29.315327,0.008893,-0.100080,0.009975,94.656113,52.960716,1.944843,1.965344,33.932362,29.315327,0.594810,0.466266,10.753372,0.008869,0.009995,87.683220,-0.522876,1.139411,97.622147,4.116127,0.316007,0.000000,1.0,0.0,99.399963
2,12.537903,9136005,221.094452,0.000000,0.008480,0.012603,220.320511,2.0,3.056435e-05,-5.005577e-04,1.0,-21.0,101.0,0.0,0.0,0.0,0.0,211.077713,219.042786,167.985031,14.997272,0.009775,0.009249,0.010154,206.873917,16.643719,15.036918,25.694096,9.460797,7.501907,6.679216,14.997272,3.580314,0.009775,0.010154,206.245773,-0.370815,1.573278,236.226303,23.031010,0.301949,0.000000,1.0,0.0,223.497986
3,12.537903,9136005,217.442139,0.000000,0.010157,0.010345,217.442139,1.0,0.000000e+00,0.000000e+00,1.0,22.0,280.0,1.0,1.0,1.0,19.0,206.086761,216.032410,131.924179,36.406002,0.009165,0.084857,0.010268,205.479294,34.932705,9.332662,57.994270,5.643030,16.092178,2.854428,36.406002,1.705848,0.009156,0.010272,209.821930,0.370779,-1.568233,199.691940,16.695520,-0.021369,0.003535,1.0,0.0,223.497986
4,12.807579,9136002,189.944199,0.000000,0.006963,0.030441,181.715485,5.0,6.359640e-04,-1.577192e-04,1.0,-53.0,9.0,-2.0,0.0,-7.0,8.0,166.529587,181.445374,84.001167,60.661690,0.008288,0.046061,0.014327,169.458038,9.336192,79.387581,11.480898,10.961783,4.525568,60.661690,4.541782,3.556031,0.008307,0.014326,131.120331,-0.933664,-0.028918,195.827820,28.702194,0.564550,0.000000,1.0,0.0,193.680542
5,5.446676,9136017,12.729020,0.000000,0.007266,0.006614,12.729020,1.0,0.000000e+00,-1.110223e-16,1.0,2.0,49.0,1.0,0.0,1.0,8.0,13.188349,13.515256,8.817220,1.665064,0.009273,-0.146166,0.009757,13.133986,0.427279,2.535194,2.791915,0.268336,0.139834,1.497952,1.665064,0.070499,0.009270,0.009264,14.025374,0.025887,0.613671,14.110765,0.351522,0.005716,0.000000,1.0,0.0,14.144497
6,5.446676,9136017,12.650763,0.000000,0.006147,0.008270,12.650763,1.0,0.000000e+00,0.000000e+00,1.0,-1.0,223.0,0.0,0.0,0.0,2.0,12.817404,13.760635,7.355963,4.009751,0.010460,0.085742,0.009192,12.891888,0.475759,4.883085,0.599375,1.558402,0.099447,4.009751,0.244135,0.648014,0.009629,0.008696,14.108038,-0.024787,-2.528021,14.463190,0.449630,0.028391,0.000000,1.0,0.0,14.144497
7,17.500393,9136008,107.924393,8.423841,0.012864,0.047712,82.378540,3.0,-2.167862e-03,6.474053e-03,0.0,20.0,27.0,0.0,0.0,0.0,0.0,82.253563,103.202377,51.282570,13.834238,0.021894,-0.061254,0.055772,84.722588,27.398100,8.006230,36.951664,7.822662,4.353574,1.782115,13.834238,2.012559,0.026104,0.032817,42.754639,1.740872,-2.509806,20.668043,2.414800,0.691514,0.000000,1.0,0.0,128.248749
8,17.500393,9136008,108.024261,11.480968,0.014954,0.044356,102.896591,3.0,1.522460e-03,-3.945758e-03,0.0,82.0,74.0,0.0,0.0,0.0,0.0,95.537163,102.174324,44.451820,31.989223,0.022739,0.035947,0.037725,94.638268,8.885093,14.859746,4.069554,45.262173,4.570642,2.954343,1.382553,31.989223,0.022790,0.037806,43.773342,-1.740667,0.638841,643.413513,239.155563,0.877622,0.000000,1.0,0.0,128.248749
9,18.757620,9136003,432.131531,0.000000,0.010091,0.016258,412.811798,3.0,2.974516e-04,7.208383e-04,1.0,-57.0,341.0,-1.0,0.0,-11.0,0.0,397.683014,415.426453,196.116516,112.017899,0.008957,0.066931,0.010391,397.062347,80.738564,20.129625,147.520401,13.164332,45.779739,7.659820,112.017899,4.063671,0.008972,0.010357,282.319489,-0.994725,-0.515282,413.268951,36.629845,0.197359,0.000000,1.0,0.0,434.072479


In [9]:
mydf.describe()

Unnamed: 0,rho,evt_eventnr,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_seedClusEnergy,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,sc_isEB,sc_iEtaOrX,sc_iPhiOrY,sc_iEtaMod5,sc_iPhiMod2,sc_iEtaMod20,sc_iPhiMod20,ssFull_e3x3,ssFull_e5x5,ssFull_eMax,ssFull_e2nd,ssFull_sigmaIEtaIEta,ssFull_sigmaIEtaIPhi,ssFull_sigmaIPhiIPhi,ssFull_e2x5Max,ssFull_e2x5Top,ssFull_e2x5Bottom,ssFull_e2x5Left,ssFull_e2x5Right,ssFull_eTop,ssFull_eBottom,ssFull_eLeft,ssFull_eRight,ssFrac_sigmaIEtaIEta,ssFrac_sigmaIPhiIPhi,ele_et,ele_trkEtaMode,ele_trkPhiMode,ele_trkPMode,ele_trkPModeErr,ele_fbrem,ele_hademTow,ele_ecalDrivenSeed,ele_nrSatCrys,mc_energy
count,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985167.0,14985170.0
mean,19.97382,5046470.0,377.9257,7.335956,0.01193437,0.02406599,372.8392,2.432788,-2.997636e-06,-5.677242e-06,0.5742211,21.45832,125.2997,0.0005020965,0.2873538,-0.004670685,5.463315,358.1909,374.4382,235.0957,63.51256,0.01546967,-0.01061414,0.02014904,354.316,46.13595,46.04879,40.64065,40.82364,26.64194,26.61393,22.2741,22.36086,0.01533254,0.01929471,151.9284,-0.01247946,-0.01745397,761.7798,31683710.0,0.4421127,0.0080372,0.9691119,0.0,396.8549
std,12.76845,2889091.0,347.8749,11.94761,0.00567293,0.01585948,348.6078,1.79988,0.001345036,0.006332732,0.4944606,47.01327,103.0828,1.846125,0.452528,8.109182,6.444108,338.8517,348.8895,247.0245,72.59937,0.0078744,0.1717326,0.01077849,332.0492,63.96088,63.91234,63.51047,63.54219,45.71831,45.69967,44.54048,44.53814,0.007624125,0.009973876,84.70121,1.502251,1.803607,247191.5,107308700000.0,2.442862,0.4315716,0.1730147,0.0,359.2299
min,0.0,1.0,0.2808963,0.0,8.671309e-05,1.403324e-06,0.2808963,1.0,-0.2552902,-0.3811861,0.0,-85.0,1.0,-4.0,0.0,-19.0,0.0,0.4548348,0.4548348,0.2317474,0.1494172,0.0,-1.0,0.0002690318,0.4298494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.035868e-10,2.194893e-09,0.02162664,-4.842429,-3.141592,0.03589295,1.568661e-05,-6814.325,0.0,0.0,0.0,1.120957
25%,9.258972,2537200.0,136.1745,0.0,0.007763034,0.01356573,131.2442,1.0,-7.104153e-05,-0.0003646433,0.0,-11.0,44.0,0.0,0.0,-2.0,0.0,125.0329,133.5008,71.57972,18.11228,0.008913856,-0.09280901,0.01086688,125.6941,10.85857,10.7941,7.24964,7.366873,4.211712,4.200323,2.424657,2.471955,0.008932089,0.0107351,78.68557,-1.294079,-1.580388,109.8301,9.068682,0.2002993,0.0,1.0,0.0,146.5395
50%,19.42506,5055925.0,268.4621,0.0,0.009724634,0.02099512,263.9473,2.0,0.0,0.0,1.0,32.0,78.0,0.0,0.0,0.0,2.0,251.1379,264.4055,153.615,40.05738,0.009519991,-0.005214928,0.01623179,250.0212,25.02279,24.96968,18.39296,18.61379,10.96402,10.95723,7.024039,7.125227,0.009511311,0.01480434,151.5566,-0.01502267,-0.02160997,230.6579,28.05947,0.4727513,0.0,1.0,0.0,280.2458
75%,29.59096,7559198.0,498.5067,13.03652,0.01511821,0.02878841,489.9037,3.0,6.925964e-05,0.0003558224,1.0,60.0,206.0,0.0,1.0,2.0,11.0,466.8564,490.3376,295.7281,80.4534,0.02354654,0.0734315,0.02625505,462.5998,54.46799,54.38106,45.80825,46.04439,28.81553,28.78906,21.09272,21.23075,0.02352336,0.02604066,225.0044,1.268709,1.534383,431.1,101.3056,0.7424718,0.002412337,1.0,0.0,527.8936
max,91.60725,10000000.0,2480.208,121.029,0.3414597,0.8972858,2480.208,54.0,0.2174096,0.3975364,1.0,100.0,360.0,4.0,1.0,19.0,19.0,2462.348,2505.503,2198.231,994.5407,0.08180051,1.0,0.09596091,2442.049,1868.554,1415.472,1361.06,1957.41,1023.464,1223.858,1010.967,860.7517,0.06799459,0.08522816,2055.027,5.28652,3.141592,912486700.0,414983600000000.0,0.9998899,240.4923,1.0,0.0,2695.155


In [126]:
#Some outlier?!
mydf['ssFull_e5x5'].min()

-7.665816

In [None]:
import matplotlib as plt
df = pd.DataFrame()
df['ssPositive'] = mydf.ssFull_e5x5.where(mydf.ssFull_e5x5 >= 0)
df['ssNegative'] = mydf.ssFull_e5x5.where(mydf.ssFull_e5x5 < 0)
df

In [15]:
df[df['ssNegative'].notnull()]

Unnamed: 0,ssPositive,ssNegative
356084,,-7.665816


In [11]:
usage()

2971.625

In [127]:
#remove one single outlier
mydf = mydf[mydf.ssFull_e5x5>0]


In [7]:
mydf.describe()

Unnamed: 0,rho,evt_eventnr,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_seedClusEnergy,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,...,ele_et,ele_trkEtaMode,ele_trkPhiMode,ele_trkPMode,ele_trkPModeErr,ele_fbrem,ele_hademTow,ele_ecalDrivenSeed,ele_nrSatCrys,mc_energy
count,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,...,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985167.0,14985170.0
mean,19.97382,5046470.0,377.9257,7.335956,0.01193437,0.02406599,372.8392,2.432788,-2.997636e-06,-5.677242e-06,...,151.9284,-0.01247946,-0.01745397,761.7798,31683710.0,0.4421127,0.0080372,0.9691119,0.0,396.8549
std,12.76845,2889091.0,347.8749,11.94761,0.00567293,0.01585948,348.6078,1.79988,0.001345036,0.006332732,...,84.70121,1.502251,1.803607,247191.5,107308700000.0,2.442862,0.4315716,0.1730147,0.0,359.2299
min,0.0,1.0,0.2808963,0.0,8.671309e-05,1.403324e-06,0.2808963,1.0,-0.2552902,-0.3811861,...,0.02162664,-4.842429,-3.141592,0.03589295,1.568661e-05,-6814.325,0.0,0.0,0.0,1.120957
25%,9.258972,2537200.0,136.1745,0.0,0.007763034,0.01356573,131.2442,1.0,-7.104153e-05,-0.0003646433,...,78.68557,-1.294079,-1.580388,109.8301,9.068682,0.2002993,0.0,1.0,0.0,146.5395
50%,19.42506,5055925.0,268.4621,0.0,0.009724634,0.02099512,263.9473,2.0,0.0,0.0,...,151.5566,-0.01502267,-0.02160997,230.6579,28.05947,0.4727513,0.0,1.0,0.0,280.2458
75%,29.59096,7559198.0,498.5067,13.03652,0.01511821,0.02878841,489.9037,3.0,6.925964e-05,0.0003558224,...,225.0044,1.268709,1.534383,431.1,101.3056,0.7424718,0.002412337,1.0,0.0,527.8936
max,91.60725,10000000.0,2480.208,121.029,0.3414597,0.8972858,2480.208,54.0,0.2174096,0.3975364,...,2055.027,5.28652,3.141592,912486700.0,414983600000000.0,0.9998899,240.4923,1.0,0.0,2695.155


### Define lists of barrel and endcap variables for training, also add to them 4 additional important variables.

In [157]:
barrel_vars = [
'mc_energy',
'sc_rawESEnergy',
    
'meas_energy',
'Emc_over_Emeas',

    
'sc_rawEnergy',
'sc_etaWidth',
'sc_phiWidth',
'sc_seedClusEnergy_over_sc_rawEnergy',
'ssFull_e5x5_over_sc_rawEnergy',
'ele_hademTow',
'rho',
'sc_dEtaSeedSC',
'sc_dPhiSeedSC',
'ssFull_e3x3_over_sc_rawEnergy',
'ssFull_sigmaIEtaIEta',
'ssFull_sigmaIEtaIPhi',
'ssFull_sigmaIPhiIPhi',
'ssFull_eMax_over_ssFull_e5x5',
'ssFull_e2nd_over_ssFull_e5x5',
'ssFull_eTop_over_ssFull_e5x5',
'ssFull_eBottom_over_ssFull_e5x5',
'ssFull_eLeft_over_ssFull_e5x5',
'ssFull_eRight_over_ssFull_e5x5',
'ssFull_e2x5Max_over_ssFull_e5x5',
'ssFull_e2x5Left_over_ssFull_e5x5',
'ssFull_e2x5Right_over_ssFull_e5x5',
'ssFull_e2x5Top_over_ssFull_e5x5',
'ssFull_e2x5Bottom_over_ssFull_e5x5',
'ele_nrSatCrys',
'sc_numberOfClusters',
'sc_iEtaOrX',
'sc_iPhiOrY',
'sc_iEtaMod5',
'sc_iPhiMod2',
'sc_iEtaMod20',
'sc_iPhiMod20'
]
len(barrel_vars)

36

In [158]:
endcap_vars = [
'mc_energy',
'sc_rawESEnergy',

'meas_energy',
'Emc_over_Emeas',
    
'sc_rawEnergy',
'sc_etaWidth',
'sc_phiWidth',
'sc_seedClusEnergy_over_sc_rawEnergy',
'ssFull_e5x5_over_sc_rawEnergy',
'ele_hademTow',
'rho',
'sc_dEtaSeedSC',
'sc_dPhiSeedSC',
'ssFull_e3x3_over_sc_rawEnergy',
'ssFull_sigmaIEtaIEta',
'ssFull_sigmaIEtaIPhi',
'ssFull_sigmaIPhiIPhi',
'ssFull_eMax_over_ssFull_e5x5',
'ssFull_e2nd_over_ssFull_e5x5',
'ssFull_eTop_over_ssFull_e5x5',
'ssFull_eBottom_over_ssFull_e5x5',
'ssFull_eLeft_over_ssFull_e5x5',
'ssFull_eRight_over_ssFull_e5x5',
'ssFull_e2x5Max_over_ssFull_e5x5',
'ssFull_e2x5Left_over_ssFull_e5x5',
'ssFull_e2x5Right_over_ssFull_e5x5',
'ssFull_e2x5Top_over_ssFull_e5x5',
'ssFull_e2x5Bottom_over_ssFull_e5x5',
'ele_nrSatCrys',
'sc_numberOfClusters',
'sc_iEtaOrX',
'sc_iPhiOrY',
'sc_rawESEnergy_over_sc_rawEnergy'
    
]
len(endcap_vars)

33

### Create exact version of variables to be used in the training.

In [130]:
mydf['sc_seedClusEnergy_over_sc_rawEnergy'] = mydf['sc_seedClusEnergy']/mydf['sc_rawEnergy']
mydf['ssFull_e5x5_over_sc_rawEnergy'] = mydf['ssFull_e5x5']/mydf['sc_rawEnergy']

mydf['ssFull_e3x3_over_sc_rawEnergy'] = mydf['ssFull_e3x3']/mydf['sc_rawEnergy']

mydf['ssFull_eMax_over_ssFull_e5x5'] = mydf['ssFull_eMax']/mydf['ssFull_e5x5']
mydf['ssFull_e2nd_over_ssFull_e5x5'] = mydf['ssFull_e2nd']/mydf['ssFull_e5x5']
mydf['ssFull_eTop_over_ssFull_e5x5'] = mydf['ssFull_eTop']/mydf['ssFull_e5x5']
mydf['ssFull_eBottom_over_ssFull_e5x5'] = mydf['ssFull_eBottom']/mydf['ssFull_e5x5']
mydf['ssFull_eLeft_over_ssFull_e5x5'] = mydf['ssFull_eLeft']/mydf['ssFull_e5x5']
mydf['ssFull_eRight_over_ssFull_e5x5'] = mydf['ssFull_eRight']/mydf['ssFull_e5x5']

mydf['ssFull_e2x5Max_over_ssFull_e5x5'] = mydf['ssFull_e2x5Max']/mydf['ssFull_e5x5']
mydf['ssFull_e2x5Left_over_ssFull_e5x5'] = mydf['ssFull_e2x5Left']/mydf['ssFull_e5x5']
mydf['ssFull_e2x5Right_over_ssFull_e5x5'] = mydf['ssFull_e2x5Right']/mydf['ssFull_e5x5']
mydf['ssFull_e2x5Top_over_ssFull_e5x5'] = mydf['ssFull_e2x5Top']/mydf['ssFull_e5x5']
mydf['ssFull_e2x5Bottom_over_ssFull_e5x5'] = mydf['ssFull_e2x5Bottom']/mydf['ssFull_e5x5']

mydf['sc_rawESEnergy_over_sc_rawEnergy'] = mydf['sc_rawESEnergy']/mydf['sc_rawEnergy']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexi

In [131]:
usage()

6697.32421875

In [16]:
mydf.describe()

Unnamed: 0,rho,evt_eventnr,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_seedClusEnergy,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,sc_isEB,sc_iEtaOrX,sc_iPhiOrY,sc_iEtaMod5,sc_iPhiMod2,sc_iEtaMod20,sc_iPhiMod20,ssFull_e3x3,ssFull_e5x5,ssFull_eMax,ssFull_e2nd,ssFull_sigmaIEtaIEta,ssFull_sigmaIEtaIPhi,ssFull_sigmaIPhiIPhi,ssFull_e2x5Max,ssFull_e2x5Top,ssFull_e2x5Bottom,ssFull_e2x5Left,ssFull_e2x5Right,ssFull_eTop,ssFull_eBottom,ssFull_eLeft,ssFull_eRight,ssFrac_sigmaIEtaIEta,ssFrac_sigmaIPhiIPhi,ele_et,ele_trkEtaMode,ele_trkPhiMode,ele_trkPMode,ele_trkPModeErr,ele_fbrem,ele_hademTow,ele_ecalDrivenSeed,ele_nrSatCrys,mc_energy,sc_seedClusEnergy_over_sc_rawEnergy,ssFull_e5x5_over_sc_rawEnergy,ssFull_e3x3_over_sc_rawEnergy,ssFull_eMax_over_ssFull_e5x5,ssFull_e2nd_over_ssFull_e5x5,ssFull_eTop_over_ssFull_e5x5,ssFull_eBottom_over_ssFull_e5x5,ssFull_eLeft_over_ssFull_e5x5,ssFull_eRight_over_ssFull_e5x5,ssFull_e2x5Max_over_ssFull_e5x5,ssFull_e2x5Left_over_ssFull_e5x5,ssFull_e2x5Right_over_ssFull_e5x5,ssFull_e2x5Top_over_ssFull_e5x5,ssFull_e2x5Bottom_over_ssFull_e5x5,sc_rawESEnergy_over_sc_rawEnergy
count,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985167.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0,14985170.0
mean,19.97382,5046470.0,377.9257,7.335956,0.01193437,0.02406599,372.8392,2.432788,-2.997636e-06,-5.677242e-06,0.5742211,21.45832,125.2997,0.0005020965,0.2873538,-0.004670685,5.463315,358.1909,374.4382,235.0957,63.51256,0.01546967,-0.01061414,0.02014904,354.316,46.13595,46.04879,40.64065,40.82364,26.64194,26.61393,22.2741,22.36086,0.01533254,0.01929471,151.9284,-0.01247946,-0.01745397,761.7798,31683710.0,0.4421127,0.0080372,0.9691119,0.0,396.8549,0.9650916,0.983199,0.9267472,0.5929695,0.1795003,0.08042685,0.08038022,0.05788452,0.05825862,0.9430997,0.1122208,0.113011,0.1419773,0.1416803,0.01761959
std,12.76845,2889091.0,347.8749,11.94761,0.00567293,0.01585948,348.6078,1.79988,0.001345036,0.006332732,0.4944606,47.01327,103.0828,1.846125,0.452528,8.109182,6.444108,338.8517,348.8895,247.0245,72.59937,0.0078744,0.1717326,0.01077849,332.0492,63.96088,63.91234,63.51047,63.54219,45.71831,45.69967,44.54048,44.53814,0.007624125,0.009973876,84.70121,1.502251,1.803607,247191.5,107308700000.0,2.442862,0.4315716,0.1730147,0.0,359.2299,0.08551732,0.4866453,0.3177953,0.1430795,0.09528822,0.08948058,0.08948939,0.07630977,0.07633292,0.03474898,0.1124365,0.1125588,0.1239299,0.1238616,0.03628573
min,0.0,1.0,0.2808963,0.0,8.671309e-05,1.403324e-06,0.2808963,1.0,-0.2552902,-0.3811861,0.0,-85.0,1.0,-4.0,0.0,-19.0,0.0,0.4548348,0.4548348,0.2317474,0.1494172,0.0,-1.0,0.0002690318,0.4298494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.035868e-10,2.194893e-09,0.02162664,-4.842429,-3.141592,0.03589295,1.568661e-05,-6814.325,0.0,0.0,0.0,1.120957,0.1644615,0.09955423,0.09955423,0.001882909,0.001653227,0.0,0.0,0.0,0.0,0.03797474,0.0,0.0,0.0,0.0,0.0
25%,9.258972,2537200.0,136.1745,0.0,0.007763034,0.01356573,131.2442,1.0,-7.104153e-05,-0.0003646433,0.0,-11.0,44.0,0.0,0.0,-2.0,0.0,125.0329,133.5008,71.57972,18.11228,0.008913856,-0.09280901,0.01086688,125.6941,10.85857,10.7941,7.24964,7.366873,4.211712,4.200323,2.424657,2.471955,0.008932089,0.0107351,78.68557,-1.294079,-1.580388,109.8301,9.068682,0.2002993,0.0,1.0,0.0,146.5395,0.9744997,0.9780469,0.9184567,0.4803678,0.09889187,0.01808485,0.01803507,0.01199475,0.01217445,0.9396445,0.0365727,0.03704876,0.04880221,0.04857456,0.0
50%,19.42506,5055925.0,268.4621,0.0,0.009724634,0.02099512,263.9473,2.0,0.0,0.0,1.0,32.0,78.0,0.0,0.0,0.0,2.0,251.1379,264.4055,153.615,40.05738,0.009519991,-0.005214928,0.01623179,250.0212,25.02279,24.96968,18.39296,18.61379,10.96402,10.95723,7.024039,7.125227,0.009511311,0.01480434,151.5566,-0.01502267,-0.02160997,230.6579,28.05947,0.4727513,0.0,1.0,0.0,280.2458,0.9968311,0.9946958,0.9534339,0.6083757,0.1649868,0.04161005,0.04157624,0.02499475,0.02542705,0.9509999,0.0657796,0.06670315,0.09332652,0.09306363,0.0
75%,29.59096,7559198.0,498.5067,13.03652,0.01511821,0.02878841,489.9037,3.0,6.925964e-05,0.0003558224,1.0,60.0,206.0,0.0,1.0,2.0,11.0,466.8564,490.3376,295.7281,80.4534,0.02354654,0.0734315,0.02625505,462.5998,54.46799,54.38106,45.80825,46.04439,28.81553,28.78906,21.09272,21.23075,0.02352336,0.02604066,225.0044,1.268709,1.534383,431.1,101.3056,0.7424718,0.002412337,1.0,0.0,527.8936,1.0,1.002112,0.9667876,0.7152542,0.2485742,0.10948,0.109428,0.06724346,0.0679663,0.9584683,0.1421103,0.1433196,0.197673,0.1972642,0.02524946
max,91.60725,10000000.0,2480.208,121.029,0.3414597,0.8972858,2480.208,54.0,0.2174096,0.3975364,1.0,100.0,360.0,4.0,1.0,19.0,19.0,2462.348,2505.503,2198.231,994.5407,0.08180051,1.0,0.09596091,2442.049,1868.554,1415.472,1361.06,1957.41,1023.464,1223.858,1010.967,860.7517,0.06799459,0.08522816,2055.027,5.28652,3.141592,912486700.0,414983600000000.0,0.9998899,240.4923,1.0,0.0,2695.155,1.0,600.6783,278.2103,0.9718506,0.8007877,0.798232,0.8201018,0.813609,0.7983075,1.0,0.9856412,0.9903917,0.9904305,0.9856783,5.267301


In [132]:
mydf.isnull().values.any()

False

### Define measured energy as a sum of ECAL and Preshower energies, also define a target ratio of energies as MC true energy over the measured energy

In [133]:
mydf['meas_energy'] = (mydf['sc_rawEnergy']+mydf['sc_rawESEnergy'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [134]:
mydf['meas_energy'].min() 

0.2808963

In [135]:
mydf['Emc_over_Emeas'] = mydf['mc_energy']/mydf['meas_energy']
mydf['Emc_over_Emeas'].min()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.022844527

In [136]:
mydf['mc_energy'].min() 


1.120957

In [137]:
mydf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14985167 entries, 0 to 14985167
Data columns (total 62 columns):
rho                                    float32
evt_eventnr                            int32
sc_rawEnergy                           float32
sc_rawESEnergy                         float32
sc_etaWidth                            float32
sc_phiWidth                            float32
sc_seedClusEnergy                      float32
sc_numberOfClusters                    float32
sc_dEtaSeedSC                          float32
sc_dPhiSeedSC                          float32
sc_isEB                                float32
sc_iEtaOrX                             float32
sc_iPhiOrY                             float32
sc_iEtaMod5                            float32
sc_iPhiMod2                            float32
sc_iEtaMod20                           float32
sc_iPhiMod20                           float32
ssFull_e3x3                            float32
ssFull_e5x5                            fl

In [159]:
usage()

13566.8359375

In [160]:
len(barrel_vars)

36

In [161]:
barrel_vars

['mc_energy',
 'sc_rawESEnergy',
 'meas_energy',
 'Emc_over_Emeas',
 'sc_rawEnergy',
 'sc_etaWidth',
 'sc_phiWidth',
 'sc_seedClusEnergy_over_sc_rawEnergy',
 'ssFull_e5x5_over_sc_rawEnergy',
 'ele_hademTow',
 'rho',
 'sc_dEtaSeedSC',
 'sc_dPhiSeedSC',
 'ssFull_e3x3_over_sc_rawEnergy',
 'ssFull_sigmaIEtaIEta',
 'ssFull_sigmaIEtaIPhi',
 'ssFull_sigmaIPhiIPhi',
 'ssFull_eMax_over_ssFull_e5x5',
 'ssFull_e2nd_over_ssFull_e5x5',
 'ssFull_eTop_over_ssFull_e5x5',
 'ssFull_eBottom_over_ssFull_e5x5',
 'ssFull_eLeft_over_ssFull_e5x5',
 'ssFull_eRight_over_ssFull_e5x5',
 'ssFull_e2x5Max_over_ssFull_e5x5',
 'ssFull_e2x5Left_over_ssFull_e5x5',
 'ssFull_e2x5Right_over_ssFull_e5x5',
 'ssFull_e2x5Top_over_ssFull_e5x5',
 'ssFull_e2x5Bottom_over_ssFull_e5x5',
 'ele_nrSatCrys',
 'sc_numberOfClusters',
 'sc_iEtaOrX',
 'sc_iPhiOrY',
 'sc_iEtaMod5',
 'sc_iPhiMod2',
 'sc_iEtaMod20',
 'sc_iPhiMod20']

In [162]:
len(endcap_vars)

33

### Subset the dataframe and keep only barrel information.

In [143]:
barrel_df = mydf[(mydf.sc_isEB==1)]

In [144]:
barrel_df.sc_isEB.min()

1.0

In [146]:
barrel_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8604800 entries, 0 to 14985167
Data columns (total 62 columns):
rho                                    float32
evt_eventnr                            int32
sc_rawEnergy                           float32
sc_rawESEnergy                         float32
sc_etaWidth                            float32
sc_phiWidth                            float32
sc_seedClusEnergy                      float32
sc_numberOfClusters                    float32
sc_dEtaSeedSC                          float32
sc_dPhiSeedSC                          float32
sc_isEB                                float32
sc_iEtaOrX                             float32
sc_iPhiOrY                             float32
sc_iEtaMod5                            float32
sc_iPhiMod2                            float32
sc_iEtaMod20                           float32
sc_iPhiMod20                           float32
ssFull_e3x3                            float32
ssFull_e5x5                            flo

### Store a 'big' barrel dataframe. Contains additional variables which may be needed for cross checking results. Already after preselection and is easy to load in.

In [148]:
barrel_df.columns = barrel_df.columns.astype(str)
print 'Write this big barrel dataframe to disk'

print 'Processing feather'
feather.write_feather(barrel_df, 'big_barrel_df.feather')

print 'Processing parquet'
barrel_df.to_parquet('big_barrel_df.parquet', engine='fastparquet', compression='gzip')

print 'All done!'

Write this big barrel dataframe to disk
Processing feather
Processing parquet
All done!


In [48]:
list(set(mydf.columns).symmetric_difference(set(barrel_vars)))

[u'ssFrac_sigmaIEtaIEta',
 u'ele_trkPhiMode',
 u'ssFull_eBottom',
 'iEtaMod20_sc',
 'meas_energy',
 u'ele_fbrem',
 u'mc_energy',
 u'ele_trkPModeErr',
 u'ssFull_eMax',
 u'ele_trkEtaMode',
 u'ssFull_e2x5Right',
 'iEtaMod5_sc',
 u'ele_trkPMode',
 'iPhiOrY_sc',
 u'ssFull_e2x5Max',
 u'ssFull_eLeft',
 u'ssFull_e2x5Top',
 'sc_rawESEnergy_over_sc_rawEnergy',
 u'sc_iPhiOrY',
 u'ssFull_e2x5Bottom',
 u'ssFrac_sigmaIPhiIPhi',
 u'ssFull_e2x5Left',
 u'ssFull_e5x5',
 u'ssFull_e2nd',
 u'evt_eventnr',
 u'sc_iEtaMod5',
 'iPhiMod2_sc',
 u'ele_ecalDrivenSeed',
 u'ssFull_eRight',
 u'ssFull_e3x3',
 u'ssFull_eTop',
 u'sc_rawESEnergy',
 u'sc_iPhiMod2',
 u'ele_et',
 u'sc_iEtaMod20',
 u'sc_isEB',
 u'sc_seedClusEnergy']

In [52]:
list(set(barrel_vars).difference(set(mydf.columns)))

['iEtaMod20_sc', 'iEtaMod5_sc', 'iPhiMod2_sc', 'iPhiOrY_sc']

In [149]:
len(list(set(mydf.columns).intersection(set(barrel_vars))))

35

### Define a barrel dataframe with variables important for training (using a dedicated notebook), and store this dataframe.

In [163]:
final_barrel_df = barrel_df[barrel_df.columns & barrel_vars]

In [164]:
usage()

13600.6875

In [165]:
final_barrel_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8604800 entries, 0 to 14985167
Data columns (total 36 columns):
rho                                    float32
sc_rawEnergy                           float32
sc_rawESEnergy                         float32
sc_etaWidth                            float32
sc_phiWidth                            float32
sc_numberOfClusters                    float32
sc_dEtaSeedSC                          float32
sc_dPhiSeedSC                          float32
sc_iEtaOrX                             float32
sc_iPhiOrY                             float32
sc_iEtaMod5                            float32
sc_iPhiMod2                            float32
sc_iEtaMod20                           float32
sc_iPhiMod20                           float32
ssFull_sigmaIEtaIEta                   float32
ssFull_sigmaIEtaIPhi                   float32
ssFull_sigmaIPhiIPhi                   float32
ele_hademTow                           float32
ele_nrSatCrys                          f

In [166]:
final_barrel_df.describe()

Unnamed: 0,rho,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,sc_iEtaOrX,sc_iPhiOrY,sc_iEtaMod5,sc_iPhiMod2,sc_iEtaMod20,sc_iPhiMod20,ssFull_sigmaIEtaIEta,ssFull_sigmaIEtaIPhi,ssFull_sigmaIPhiIPhi,ele_hademTow,ele_nrSatCrys,mc_energy,sc_seedClusEnergy_over_sc_rawEnergy,ssFull_e5x5_over_sc_rawEnergy,ssFull_e3x3_over_sc_rawEnergy,ssFull_eMax_over_ssFull_e5x5,ssFull_e2nd_over_ssFull_e5x5,ssFull_eTop_over_ssFull_e5x5,ssFull_eBottom_over_ssFull_e5x5,ssFull_eLeft_over_ssFull_e5x5,ssFull_eRight_over_ssFull_e5x5,ssFull_e2x5Max_over_ssFull_e5x5,ssFull_e2x5Left_over_ssFull_e5x5,ssFull_e2x5Right_over_ssFull_e5x5,ssFull_e2x5Top_over_ssFull_e5x5,ssFull_e2x5Bottom_over_ssFull_e5x5,meas_energy,Emc_over_Emeas
count,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0,8604800.0
mean,19.93312,208.8408,0.0,0.008282442,0.02267043,2.819328,-6.00056e-07,-8.303765e-06,-0.1041085,180.7342,0.0008743957,0.5004235,-0.008133949,9.514298,0.008967754,-0.003373731,0.01245964,0.003639736,0.0,215.6655,0.9556687,0.971935,0.9127039,0.58147,0.184844,0.09064155,0.09060916,0.05227534,0.05290156,0.9520532,0.1033461,0.1046839,0.154171,0.1537542,208.8408,1.078456
std,12.76103,133.6567,0.0,0.00181099,0.01767361,2.05348,0.00130278,0.006644111,48.94718,104.5375,2.436248,0.4999999,10.70132,5.811683,0.0009460002,0.09930093,0.002934863,0.3363293,0.0,136.2011,0.09621,0.1575218,0.1439846,0.1402599,0.09465282,0.09330297,0.09329543,0.07224157,0.07229788,0.01397212,0.1081271,0.1084513,0.1269594,0.1269048,133.6567,1.258465
min,0.0,0.2808963,0.0,0.0002126136,4.063789e-05,1.0,-0.09987355,-0.3811861,-85.0,1.0,-4.0,0.0,-19.0,0.0,0.0,-1.0,0.0002690318,0.0,0.0,1.120957,0.1644615,0.1988927,0.1545707,0.005239881,0.004797683,0.0,0.0,0.0,0.0,0.1298138,0.0,0.0,0.0,0.0,0.2808963,0.1287299
25%,9.218432,100.2411,0.0,0.00721753,0.01040706,1.0,-0.0001002429,-0.0005385262,-42.0,90.0,-2.0,0.0,-9.0,4.0,0.008719118,-0.05687158,0.01033538,0.0,0.0,105.938,0.96198,0.9679362,0.9021573,0.4708541,0.1048821,0.02210198,0.02204533,0.01086841,0.01112711,0.9455189,0.03401419,0.03469518,0.05524228,0.05486086,100.2411,1.005088
50%,19.37348,195.4058,0.0,0.008037896,0.01705686,2.0,0.0,0.0,-1.0,181.0,0.0,1.0,0.0,10.0,0.008992155,-0.001011613,0.01122502,0.0,0.0,202.0149,0.9940259,0.9915215,0.946832,0.5934177,0.1721211,0.05159103,0.05159243,0.02175028,0.02237645,0.9539108,0.05876791,0.06014754,0.1062083,0.1058485,195.4058,1.017478
75%,29.54667,292.626,0.0,0.009288809,0.02868558,4.0,9.807901e-05,0.0005250425,42.0,272.0,2.0,1.0,9.0,15.0,0.009309629,0.05199568,0.01413548,0.0,0.0,298.4509,1.0,0.998704,0.9608448,0.7020959,0.254047,0.1280296,0.1280553,0.05652317,0.05794043,0.9593377,0.123492,0.1259224,0.2189473,0.2184005,292.626,1.044554
max,84.82404,686.2073,0.0,0.1966752,0.4416792,22.0,0.1021055,0.3366225,85.0,360.0,4.0,1.0,19.0,19.0,0.02663168,1.0,0.03114974,240.4923,0.0,709.7875,1.0,174.7949,123.442,0.9563972,0.4878334,0.7932374,0.7440041,0.7612672,0.7702944,1.0,0.9795076,0.9027611,0.9604359,0.9627705,686.2073,711.4703


In [167]:
usage()

13601.58203125

In [168]:
final_barrel_df.isnull().values.any()

False

In [76]:
final_barrel_df.columns = final_barrel_df.columns.astype(str)

### Store the final barrel dataframe.

In [169]:
%time
print 'Write this final barrel dataframe to disk'

print 'Processing feather'
feather.write_feather(final_barrel_df, 'final_barrel_df.feather')

print 'Processing parquet'
final_barrel_df.to_parquet('final_barrel_df.parquet', engine='fastparquet', compression='gzip')

print 'All done!'

CPU times: user 18 µs, sys: 5 µs, total: 23 µs
Wall time: 41 µs
Write this final barrel dataframe to disk
Processing feather
Processing parquet
All done!


In [170]:
usage()

13610.703125

### Repeat the same logic with endcap: subset endcap information and store it.

In [171]:
endcap_df = mydf[(mydf.sc_isEB==0)]

In [172]:
endcap_df.sc_isEB.max()

0.0

In [178]:
endcap_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6380367 entries, 7 to 14985165
Data columns (total 62 columns):
rho                                    float32
evt_eventnr                            int32
sc_rawEnergy                           float32
sc_rawESEnergy                         float32
sc_etaWidth                            float32
sc_phiWidth                            float32
sc_seedClusEnergy                      float32
sc_numberOfClusters                    float32
sc_dEtaSeedSC                          float32
sc_dPhiSeedSC                          float32
sc_isEB                                float32
sc_iEtaOrX                             float32
sc_iPhiOrY                             float32
sc_iEtaMod5                            float32
sc_iPhiMod2                            float32
sc_iEtaMod20                           float32
sc_iPhiMod20                           float32
ssFull_e3x3                            float32
ssFull_e5x5                            flo

### We do not need anymore barrel dataframes in this notebook, can safely remove them from the RAM.

In [174]:
del barrel_df

In [175]:
usage()

13094.02734375

In [176]:
del final_barrel_df

In [177]:
usage()

11912.3359375

In [179]:
endcap_df.columns = endcap_df.columns.astype(str)
print 'Write this big endcap dataframe to disk'

print 'Processing feather'
feather.write_feather(endcap_df, 'big_endcap_df.feather')

print 'Processing parquet'
endcap_df.to_parquet('big_endcap_df.parquet', engine='fastparquet', compression='gzip')

print 'All done!'

Write this big endcap dataframe to disk
Processing feather
Processing parquet
All done!


In [180]:
usage()

11918.03515625

In [181]:
final_endcap_df = endcap_df[endcap_df.columns & endcap_vars]

In [182]:
final_endcap_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6380367 entries, 7 to 14985165
Data columns (total 33 columns):
rho                                    float32
sc_rawEnergy                           float32
sc_rawESEnergy                         float32
sc_etaWidth                            float32
sc_phiWidth                            float32
sc_numberOfClusters                    float32
sc_dEtaSeedSC                          float32
sc_dPhiSeedSC                          float32
sc_iEtaOrX                             float32
sc_iPhiOrY                             float32
ssFull_sigmaIEtaIEta                   float32
ssFull_sigmaIEtaIPhi                   float32
ssFull_sigmaIPhiIPhi                   float32
ele_hademTow                           float32
ele_nrSatCrys                          float32
mc_energy                              float32
sc_seedClusEnergy_over_sc_rawEnergy    float32
ssFull_e5x5_over_sc_rawEnergy          float32
ssFull_e3x3_over_sc_rawEnergy          f

In [183]:
final_endcap_df.describe()

Unnamed: 0,rho,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,sc_iEtaOrX,sc_iPhiOrY,ssFull_sigmaIEtaIEta,ssFull_sigmaIEtaIPhi,ssFull_sigmaIPhiIPhi,ele_hademTow,ele_nrSatCrys,mc_energy,sc_seedClusEnergy_over_sc_rawEnergy,ssFull_e5x5_over_sc_rawEnergy,ssFull_e3x3_over_sc_rawEnergy,ssFull_eMax_over_ssFull_e5x5,ssFull_e2nd_over_ssFull_e5x5,ssFull_eTop_over_ssFull_e5x5,ssFull_eBottom_over_ssFull_e5x5,ssFull_eLeft_over_ssFull_e5x5,ssFull_eRight_over_ssFull_e5x5,ssFull_e2x5Max_over_ssFull_e5x5,ssFull_e2x5Left_over_ssFull_e5x5,ssFull_e2x5Right_over_ssFull_e5x5,ssFull_e2x5Top_over_ssFull_e5x5,ssFull_e2x5Bottom_over_ssFull_e5x5,sc_rawESEnergy_over_sc_rawEnergy,meas_energy,Emc_over_Emeas
count,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0,6380367.0
mean,20.02875,605.9588,17.22951,0.01685951,0.02594809,1.911485,-6.231104e-06,-2.135019e-06,50.53819,50.53917,0.0242384,-0.02037881,0.03051924,0.01396778,0.0,641.2155,0.9777978,0.9983891,0.9456851,0.6084775,0.1722936,0.06665095,0.06658509,0.06544929,0.06548342,0.9310262,0.1241894,0.1242414,0.1255324,0.1253971,0.04138204,623.1884,1.151106
std,12.77826,411.7969,12.8373,0.005377533,0.01277936,1.203495,0.001399998,0.005886765,22.07478,22.02891,0.003243336,0.2362239,0.008599845,0.5336924,0.0,417.2255,0.06641851,0.7227359,0.4567427,0.1453626,0.09566692,0.08206116,0.08208965,0.08086134,0.08089892,0.04815415,0.1169375,0.1169364,0.1177397,0.1176892,0.0459239,417.6558,2.477922
min,0.0,0.7550039,0.0,8.671309e-05,1.403324e-06,1.0,-0.2552902,-0.2876483,1.0,1.0,0.0005438504,-0.9974241,0.0003754915,0.0,0.0,2.42204,0.1797029,0.09955423,0.09955423,0.001882909,0.001653227,0.0,0.0,0.0,0.0,0.03797474,0.0,0.0,0.0,0.0,0.0,0.7550039,0.02284453
25%,9.314892,272.8871,7.855617,0.01293468,0.01893865,1.0,-1.793936e-05,-8.068089e-05,33.0,33.0,0.02278456,-0.1887045,0.02474766,0.0,0.0,306.299,0.9870287,0.988948,0.9382826,0.495566,0.09141396,0.01420182,0.0141795,0.01396071,0.01397697,0.9256333,0.04115809,0.04119439,0.04167926,0.04164112,0.01583423,285.2552,1.002864
50%,19.49312,548.071,15.66993,0.01617286,0.02321593,2.0,0.0,0.0,51.0,51.0,0.0239245,-0.02344828,0.02726633,0.001603853,0.0,586.2535,0.9990657,0.9985538,0.9620269,0.6293145,0.1551048,0.03093261,0.03086119,0.03039867,0.03043937,0.9445233,0.07727589,0.07734247,0.07814413,0.07799799,0.03060205,565.4055,1.018023
75%,29.65118,857.6479,24.73589,0.02020009,0.02885893,2.0,1.611407e-05,7.099856e-05,68.0,68.0,0.02505134,0.1425038,0.03377106,0.004809177,0.0,896.9975,1.0,1.00626,0.9741447,0.7318827,0.2405415,0.08333041,0.08312925,0.08167752,0.08156478,0.9559149,0.1657324,0.1658197,0.1676858,0.167489,0.05297057,882.9196,1.048977
max,91.60725,2480.208,121.029,0.3414597,0.8972858,54.0,0.2174096,0.3975364,100.0,100.0,0.08180051,0.9994417,0.09596091,234.0993,0.0,2695.155,1.0,600.6783,278.2103,0.9718506,0.8007877,0.798232,0.8201018,0.813609,0.7983075,1.0,0.9856412,0.9903917,0.9904305,0.9856783,5.267301,2480.208,829.7859


In [184]:
final_endcap_df.isnull().values.any()

False

In [185]:
final_endcap_df.columns = final_endcap_df.columns.astype(str)

In [187]:
final_endcap_df

Unnamed: 0,rho,sc_rawEnergy,sc_rawESEnergy,sc_etaWidth,sc_phiWidth,sc_numberOfClusters,sc_dEtaSeedSC,sc_dPhiSeedSC,sc_iEtaOrX,sc_iPhiOrY,ssFull_sigmaIEtaIEta,ssFull_sigmaIEtaIPhi,ssFull_sigmaIPhiIPhi,ele_hademTow,ele_nrSatCrys,mc_energy,sc_seedClusEnergy_over_sc_rawEnergy,ssFull_e5x5_over_sc_rawEnergy,ssFull_e3x3_over_sc_rawEnergy,ssFull_eMax_over_ssFull_e5x5,ssFull_e2nd_over_ssFull_e5x5,ssFull_eTop_over_ssFull_e5x5,ssFull_eBottom_over_ssFull_e5x5,ssFull_eLeft_over_ssFull_e5x5,ssFull_eRight_over_ssFull_e5x5,ssFull_e2x5Max_over_ssFull_e5x5,ssFull_e2x5Left_over_ssFull_e5x5,ssFull_e2x5Right_over_ssFull_e5x5,ssFull_e2x5Top_over_ssFull_e5x5,ssFull_e2x5Bottom_over_ssFull_e5x5,sc_rawESEnergy_over_sc_rawEnergy,meas_energy,Emc_over_Emeas
7,17.500393,107.924393,8.423841,0.012864,0.047712,3.0,-0.002168,6.474053e-03,20.0,27.0,0.021894,-0.061254,0.055772,0.000000,0.0,128.248749,0.763299,0.956247,0.762141,0.496913,0.134050,0.042185,0.017268,0.134050,0.019501,0.820936,0.358051,0.075799,0.265479,0.077578,0.078053,116.348236,1.102284
8,17.500393,108.024261,11.480968,0.014954,0.044356,3.0,0.001522,-3.945758e-03,82.0,74.0,0.022739,0.035947,0.037725,0.000000,0.0,128.248749,0.952532,0.945846,0.884405,0.435059,0.313085,0.044734,0.028915,0.013531,0.313085,0.926243,0.039830,0.442990,0.086960,0.145435,0.106281,119.505226,1.073164
15,20.662251,414.189697,23.596441,0.014047,0.027942,5.0,-0.000815,-1.600069e-03,79.0,72.0,0.023977,0.038398,0.029885,0.000000,0.0,435.324951,0.984393,0.988749,0.947156,0.745048,0.052869,0.026008,0.052869,0.027238,0.052260,0.916417,0.073706,0.094511,0.061845,0.103743,0.056970,437.786133,0.994378
16,20.662251,416.708496,10.159088,0.014249,0.017439,2.0,0.000133,4.568984e-04,22.0,30.0,0.025463,0.289535,0.022554,0.000000,0.0,435.324951,0.998064,1.000397,0.969539,0.566011,0.236400,0.009632,0.236400,0.063468,0.010395,0.957872,0.149547,0.031182,0.021386,0.333354,0.024379,426.867584,1.019813
17,21.473417,55.558941,5.046361,0.010976,0.048473,4.0,-0.002194,1.183567e-03,23.0,83.0,0.028099,-0.296239,0.045582,0.000000,0.0,72.982941,0.929340,0.940300,0.858293,0.757637,0.058510,0.022232,0.058510,0.019532,0.000000,0.924664,0.088491,0.056205,0.064812,0.158020,0.090829,60.605301,1.204234
18,21.473417,56.471313,3.622597,0.008810,0.012283,1.0,0.000000,0.000000e+00,75.0,17.0,0.022231,0.038194,0.029640,0.000000,0.0,72.982941,1.000000,1.037938,0.988445,0.697057,0.088912,0.085239,0.015435,0.014109,0.088912,0.939411,0.045717,0.145400,0.139298,0.049250,0.064149,60.093910,1.214481
23,34.537971,52.768269,12.165409,0.013427,0.045925,3.0,0.000732,2.427212e-02,33.0,21.0,0.027735,-0.149533,0.040608,0.000000,0.0,75.451454,0.649225,0.861515,0.616603,0.272165,0.200463,0.055002,0.000000,0.174173,0.006913,0.756154,0.621901,0.025099,0.535380,0.000000,0.230544,64.933678,1.161977
24,34.537971,67.908142,9.261819,0.017378,0.033213,2.0,0.000875,3.674139e-03,70.0,77.0,0.033873,-0.229690,0.038855,0.025398,0.0,75.451454,0.981718,1.024301,0.904198,0.590209,0.078184,0.078184,0.023915,0.059919,0.034098,0.851174,0.212026,0.081395,0.195841,0.083890,0.136387,77.169960,0.977731
27,9.288201,98.426880,8.117278,0.015359,0.016453,1.0,0.000000,0.000000e+00,67.0,63.0,0.024752,0.069475,0.027962,0.000000,0.0,111.682686,1.000000,1.036042,1.004620,0.772702,0.081282,0.015572,0.033359,0.024006,0.081282,0.944981,0.053170,0.125198,0.030984,0.081812,0.082470,106.544159,1.048229
28,9.288201,98.425285,6.405208,0.023443,0.028715,1.0,0.000000,0.000000e+00,34.0,39.0,0.028889,-0.015194,0.037591,0.013782,0.0,111.682686,1.000000,1.015834,0.957223,0.537257,0.243954,0.021863,0.056980,0.243954,0.006571,0.957306,0.360730,0.023169,0.102085,0.109104,0.065077,104.830490,1.065364


In [189]:
print 'Write this final endcap dataframe to disk'

print 'Processing feather'
feather.write_feather(final_endcap_df, 'final_endcap_df.feather')

print 'Processing parquet'
final_endcap_df.to_parquet('final_endcap_df.parquet', engine='fastparquet', compression='gzip')

print 'All done!'

Write this final endcap dataframe to disk
Processing feather
Processing parquet
All done!


In [188]:
usage()

12723.203125

In [190]:
mydf.shape

(14985167, 62)

In [191]:
del mydf


In [199]:
usage()

5851.421875

In [200]:
final_endcap_df.shape

(6380367, 33)

In [201]:
del endcap_df
del final_endcap_df

In [202]:
usage()

4365.05078125

In [204]:
del barrel_vars, endcap_vars


4365.09375

In [205]:
usage()

4365.09375

In [206]:
whos

Variable         Type        Data/Info
--------------------------------------
feather          module      <module 'pyarrow.feather'<...>ges/pyarrow/feather.pyc'>
garbageCollect   function    <function garbageCollect at 0x7ff13170b320>
gc               module      <module 'gc' (built-in)>
name             str         _ih
np               module      <module 'numpy' from '/cv<...>ages/numpy/__init__.pyc'>
os               module      <module 'os' from '/cvmfs<...>pt/lib/python2.7/os.pyc'>
pd               module      <module 'pandas' from '/c<...>ges/pandas/__init__.pyc'>
psutil           module      <module 'psutil' from '/c<...>ges/psutil/__init__.pyc'>
usage            function    <function usage at 0x7ff13170b1b8>


In [207]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [209]:
whos


Interactive namespace is empty.


In [210]:
for name in vars().keys():
    print(name)

_i
_sh
_dh
_i209
_i208
__builtins__
exit
_i210
__name__
__builtin__
quit
_iii
_oh
Out
get_ipython
In
_ii
_ih
