# Analysis of DMR++ Information

In [1]:
import pandas as pd
from bokeh.charts import output_notebook, output_file, show, Scatter, Histogram, TimeSeries
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import Range1d, HoverTool, ResizeTool, FixedTicker
output_notebook()

In [2]:
dmrpp = pd.read_csv('../../../dmrpp_stats.csv')

In [3]:
dmrpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8120478 entries, 0 to 8120477
Data columns (total 7 columns):
File          object
Dataset       object
Chunk_Flag    bool
UUID          object
Checksum      object
Offset        int64
Size          int64
dtypes: bool(1), int64(2), object(4)
memory usage: 379.5+ MB


In [4]:
dmrpp.head(3)

Unnamed: 0,File,Dataset,Chunk_Flag,UUID,Checksum,Offset,Size
0,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/TropHeight_TqJ_D_max,True,8c3b0087-973a-4073-8215-8490fd26065c,d4a492ad98ae783145c9286cf69e5b8a,283428105,149944
1,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/CoarseCloudFrc_TqJ_D_ct,True,065860ce-ebd0-465a-8512-bf7d56557a58,0555723dc275296109273ecf6bb8fe64,294468920,112388
2,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/O3_VMR_TqJ_D_min,True,9752def8-fd3a-462f-9f86-700a2790b89d,bcaf632733e1ef9ad9a281589c02c4dc,301573840,233452


In [5]:
dmrpp.tail(3)

Unnamed: 0,File,Dataset,Chunk_Flag,UUID,Checksum,Offset,Size
8120475,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA1/Data Fields/Temperature,False,38c4ac4b-8f4e-414e-a6c3-5342bacd0a60,ac971f91f30b76bc1b481501b0a53542,40696,32
8120476,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA2/Data Fields/Temperature,False,1c73591f-e26c-402a-8429-0941e78a9708,d709add9a8fd30f7432e76d43def1494,40776,128
8120477,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA1/Data Fields/Latitude,False,cf98febc-e0ea-4cac-ad5c-423e91ddbca8,6d8ca9f67d9914dd04b31533021b7e17,40680,16


Create a new column for file types:

In [6]:
def file_type(v):
    """Classify a file name by its leading characters.

    Returns 'airs' when the name starts with 'AIRS', 'merra' when it starts
    with 'MERRA2', and 'sample' for every other name.
    """
    prefix_labels = (('AIRS', 'airs'), ('MERRA2', 'merra'))
    for prefix, label in prefix_labels:
        if v.startswith(prefix):
            return label
    # No known prefix matched.
    return 'sample'

In [7]:
dmrpp['Type'] = dmrpp['File'].map(file_type)

What do we have now?

In [8]:
dmrpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8120478 entries, 0 to 8120477
Data columns (total 8 columns):
File          object
Dataset       object
Chunk_Flag    bool
UUID          object
Checksum      object
Offset        int64
Size          int64
Type          object
dtypes: bool(1), int64(2), object(5)
memory usage: 441.4+ MB


## How many records are there for each file type?

In [9]:
grp = dmrpp.groupby('Type')

In [10]:
grp.size()

Type
airs       569765
merra     7494823
sample      55890
dtype: int64

## Analysis: AIRS

Select only the AIRS data:

In [11]:
airs = dmrpp[dmrpp.Type == 'airs']

In [12]:
airs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569765 entries, 0 to 569764
Data columns (total 8 columns):
File          569765 non-null object
Dataset       569765 non-null object
Chunk_Flag    569765 non-null bool
UUID          569765 non-null object
Checksum      569765 non-null object
Offset        569765 non-null int64
Size          569765 non-null int64
Type          569765 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 35.3+ MB


### Byte Streams

#### How many byte streams with unique content?

In [13]:
x = airs.Checksum.value_counts()

In [14]:
len(x.index)

330353

#### Which are the Top 20 byte streams based on content?

In [15]:
x.sort_values(ascending=False).head(20)

c55324594a0787856d9b908fc7a7a201    49640
a43f0ca2084e297f3aa9763a828dcb0f     6570
13194f25768e29a4921fa9f6ba022342     1460
7c90147b4fcb72404dd36c4b0d82051d     1460
7b744463b12501af7f1b66dece5e41b4      730
6c9db2946c92fcf6e0d9f8c3bb49d1ff      365
19c03d94e619f2f3d8ad1428c6c1389c      365
cce9f79f6073bdaf715f1a5a07bf72ec      365
5bbb0bd53711bae49dc9e3c6b37bebd6      365
f6da25808f22177cef49588cc08740ca      365
cd581cc833cabac662ad43499330bed6      365
e03cc92919bb584379ea825173440b1c      365
a71f8216cff7fca9d2c00e768589efdd      365
0b2396e0349f169f6f5e16650f4063c5      365
61f33d4ee48f21fef8b6ce240dbf12c2      365
08676799dc49855f9d71198e0c49d1ec      365
9cb1ca22f2c372217dd18cd55f0f1e4f       15
c1c4bba0373b08baee01c39774c97d42       15
445c789e45bf87283f4b443bbe22e8f1       15
0b35945a25e243ca38a2eb89d4ccc002       15
Name: Checksum, dtype: int64

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [16]:
cksum = 'c55324594a0787856d9b908fc7a7a201'

In [17]:
grp = airs.groupby('Checksum')

Per dataset across all AIRS files that have it:

In [18]:
grp.get_group(cksum)['Dataset'].value_counts()

/Temperature_MW_D_sdev    2920
/GPHeight_MW_A_sdev       2920
/GPHeight_MW_D            2920
/Temperature_MW_D_min     2920
/GPHeight_MW_D_max        2920
/Temperature_MW_D_max     2920
/Temperature_MW_A_sdev    2920
/GPHeight_MW_D_sdev       2920
/GPHeight_MW_A            2920
/GPHeight_MW_D_min        2920
/Temperature_MW_A_max     2920
/GPHeight_MW_A_max        2920
/Temperature_MW_A_min     2920
/Temperature_MW_D         2920
/GPHeight_MW_A_min        2920
/Temperature_MW_A         2920
/Emis_MW_D_max             365
/Emis_MW_A                 365
/Emis_MW_D_sdev            365
/Emis_MW_A_max             365
/Emis_MW_A_min             365
/Emis_MW_D_min             365
/Emis_MW_D                 365
/Emis_MW_A_sdev            365
Name: Dataset, dtype: int64

Per file including all its datasets:

In [19]:
grp.get_group(cksum)['File'].value_counts()

AIRS.2015.09.23.L3.RetStd_IR001.v6.0.31.0.G15281123841.nc.h5    136
AIRS.2015.03.16.L3.RetStd_IR001.v6.0.11.0.G15076184035.nc.h5    136
AIRS.2015.09.17.L3.RetStd_IR001.v6.0.31.0.G15265195548.nc.h5    136
AIRS.2015.05.15.L3.RetStd_IR001.v6.0.31.0.G15213005119.nc.h5    136
AIRS.2015.12.15.L3.RetStd_IR001.v6.0.31.0.G15350164838.nc.h5    136
AIRS.2015.07.10.L3.RetStd_IR001.v6.0.31.0.G15192172639.nc.h5    136
AIRS.2015.12.25.L3.RetStd_IR001.v6.0.31.0.G15362153615.nc.h5    136
AIRS.2015.05.23.L3.RetStd_IR001.v6.0.31.0.G15225104912.nc.h5    136
AIRS.2015.04.13.L3.RetStd_IR001.v6.0.31.0.G15190185721.nc.h5    136
AIRS.2015.04.21.L3.RetStd_IR001.v6.0.31.0.G15197132455.nc.h5    136
AIRS.2015.06.16.L3.RetStd_IR001.v6.0.31.0.G15208195104.nc.h5    136
AIRS.2015.06.19.L3.RetStd_IR001.v6.0.31.0.G15211041208.nc.h5    136
AIRS.2015.03.26.L3.RetStd_IR001.v6.0.31.0.G15187102922.nc.h5    136
AIRS.2015.11.18.L3.RetStd_IR001.v6.0.31.0.G15323173646.nc.h5    136
AIRS.2015.07.22.L3.RetStd_IR001.v6.0.31.0.G15205

#### What is the total number of byte streams in the AIRS files?

In [20]:
x.sum()

569765

#### What is the percentage reduction in the number of S3 objects based on repeating checksums?

In [21]:
100 * (len(x.index)/x.sum() - 1)

-42.019429062859246

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [22]:
before = airs.Size.sum()
before

114762412036

In [23]:
after = airs.drop_duplicates('Checksum')['Size'].sum()
after

80984748175

In [24]:
100 * (after/before - 1)

-29.432689032715899

#### What are the size statistics for unique byte streams?

In [25]:
# Keep one row per distinct checksum, then divide Size by 1000 so the values
# match the kilobyte axis label.
p = Histogram(
    airs.drop_duplicates(subset='Checksum').Size.div(1000),
    bins=50,
    xlabel='Byte Stream Size (kilobytes)',
)
show(p)

In [26]:
airs.drop_duplicates('Checksum')['Size'].describe()

count    3.303530e+05
mean     2.451461e+05
std      2.341771e+05
min      2.000000e+01
25%      8.419200e+04
50%      1.922770e+05
75%      2.363170e+05
max      1.122311e+06
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [27]:
grp = airs.groupby('File')

In [28]:
# Count the distinct dataset names in each file. `nunique` does this directly,
# without materialising a per-file array of names and measuring it through a
# lambda (missing names, if any, are not counted).
grp.Dataset.nunique().describe()

count    365.0
mean     777.0
std        0.0
min      777.0
25%      777.0
50%      777.0
75%      777.0
max      777.0
Name: Dataset, dtype: float64

#### Is every dataset in every AIRS file chunked?

In [29]:
grp = airs.groupby(['File', 'Dataset'])

In [30]:
# Use the group-by's own `all` reduction instead of handing the Python builtin
# to `agg` (pandas >= 2.1 emits a FutureWarning for that form). The inner call
# yields one flag per group; the outer call collapses them into a single answer.
grp.Chunk_Flag.all().all()

True

#### How many byte streams per dataset?

In [31]:
grp.UUID.count().describe()

count    283605.000000
mean          2.009009
std           2.458655
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           8.000000
Name: UUID, dtype: float64

In [32]:
# Derive the bin edges and axis ticks from the data instead of hard-coding
# them, so each observed count gets its own bar centred on an integer.
streams_per_dataset = grp.UUID.count()
max_num_chunk = int(streams_per_dataset.max())
p = Histogram(streams_per_dataset,
              # Half-integer edges: 0.5, 1.5, ..., max_num_chunk + 0.5
              bins=[edge + 0.5 for edge in range(max_num_chunk + 1)],
              xlabel='Number of byte streams per dataset',
              ylabel='Count(Dataset per File)')
p.xaxis[0].ticker = FixedTicker(ticks=list(range(max_num_chunk + 1)))
show(p)

## Analysis: `MERRA2_400.tavgM_2d_slv_Nx`

Select only DMR++ entries for these files:

In [33]:
# Match the product name literally: `str.contains` interprets its pattern as a
# regular expression by default, so the '.' would otherwise match any character.
merra2_tavg = dmrpp[dmrpp.File.str.contains('MERRA2_400.tavgM_2d_slv_Nx', regex=False)]

In [34]:
merra2_tavg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82639 entries, 7981949 to 8064587
Data columns (total 8 columns):
File          82639 non-null object
Dataset       82639 non-null object
Chunk_Flag    82639 non-null bool
UUID          82639 non-null object
Checksum      82639 non-null object
Offset        82639 non-null int64
Size          82639 non-null int64
Type          82639 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 5.1+ MB


### Byte Streams

#### How many byte streams with unique content?

In [35]:
x = merra2_tavg.Checksum.value_counts()

In [36]:
len(x.index)

81057

#### Which are the Top 20 byte streams based on content?

In [37]:
x.sort_values(ascending=False).head(20)

d57d72206d7fb5fad96714288dd86f29    108
6ed95e16d4b2f755e1ff226faedac1d1     54
dc51ab56717a3cae07a2ef00aa155512     54
08044720030d15a8aa892941bdf6a40e     54
3a862ab7992a90ec1f5e7999717f7c7d     54
196be26f3d971e3c37190636133fee49     54
14fcc6e1b10445175cd974c68f575a86     54
88ccb77821816d00ce02b679bd54523f     54
f1d3ff8443297732862df21dc4e57262     13
03e10d25b09a490f92d2b279c0340d4c     13
adb99ec2a68b1d16fc9cd2f1d33e983d     13
7e9dc3be25b28304a270e09e299364d5      5
fda0be3e6ff2d01c17c367cf6e83617d      5
99208f172d6213168d67872b8f861a5e      5
99ca2a29675d2c7c4212ff393f2b2ee5      5
4da191da9a03cbb2e34811a25be48775      5
11b1ae795c90919ee50a56e9ed59b3b6      5
c05bcc42545eb9d4c54b02eeff44da8d      5
4cdf7ae7885489f9acec788b61f666b6      5
76dd3d52a5f347523dc4222f7e3d9497      5
Name: Checksum, dtype: int64

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [38]:
cksum = 'd57d72206d7fb5fad96714288dd86f29'

In [39]:
grp = merra2_tavg.groupby('Checksum')

Per dataset across all the files that have it:

In [40]:
grp.get_group(cksum)['Dataset'].value_counts()

/DISPH        54
/Var_DISPH    54
Name: Dataset, dtype: int64

Per file including all its datasets:

In [41]:
grp.get_group(cksum)['File'].value_counts().sort_index()

MERRA2_400.tavgM_2d_slv_Nx.201101.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201102.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201103.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201104.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201105.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201106.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201107.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201108.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201109.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201110.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201111.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201112.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201201.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201202.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201203.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201204.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201205.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201206.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201207.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201208.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201209.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201210.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201211.nc4    2
MERRA2_400.

#### What is the total number of byte streams in the files?

In [42]:
x.sum()

82639

#### What is the percentage reduction in the number of S3 objects based on repeating checksums?

In [43]:
100 * (len(x.index)/x.sum() - 1)

-1.9143503672600093

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [44]:
before = merra2_tavg.Size.sum()
before

3912188540

In [45]:
after = merra2_tavg.drop_duplicates('Checksum')['Size'].sum()
after

3891337556

In [46]:
100 * (after/before - 1)

-0.53297492661230628

#### What are the size statistics for unique byte streams?

In [47]:
# Keep one row per distinct checksum, then divide Size by 1000 so the values
# match the kilobyte axis label.
p = Histogram(
    merra2_tavg.drop_duplicates(subset='Checksum').Size.div(1000),
    bins=50,
    xlabel='Byte Stream Size (kilobytes)',
)
show(p)

In [48]:
merra2_tavg.drop_duplicates('Checksum')['Size'].describe()

count     81057.000000
mean      48007.421395
std       97027.972354
min           4.000000
25%       33161.000000
50%       37944.000000
75%       40204.000000
max      831744.000000
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [49]:
grp = merra2_tavg.groupby('File')

In [50]:
# Count the distinct dataset names in each file. `nunique` does this directly,
# without materialising a per-file array of names and measuring it through a
# lambda (missing names, if any, are not counted).
grp.Dataset.nunique().describe()

count    67.0
mean     97.0
std       0.0
min      97.0
25%      97.0
50%      97.0
75%      97.0
max      97.0
Name: Dataset, dtype: float64

#### Is every dataset in every file chunked?

In [51]:
grp = merra2_tavg.groupby(['File', 'Dataset'])

In [52]:
# Use the group-by's own `all` reduction instead of handing the Python builtin
# to `agg` (pandas >= 2.1 emits a FutureWarning for that form). The inner call
# yields one flag per group; the outer call collapses them into a single answer.
grp.Chunk_Flag.all().all()

False

#### How many byte streams per dataset?

In [53]:
grp.UUID.count().describe()

count    6499.000000
mean       12.715649
std         6.203566
min         1.000000
25%        16.000000
50%        16.000000
75%        16.000000
max        16.000000
Name: UUID, dtype: float64

In [54]:
# Take the largest per-dataset count from the data rather than hard-coding it,
# so the bins and ticks stay correct if the input changes.
streams_per_dataset = grp.UUID.count()
max_num_chunk = int(streams_per_dataset.max())
p = Histogram(streams_per_dataset,
              # Half-integer edges: 0.5, 1.5, ..., max_num_chunk + 0.5
              bins=[edge + 0.5 for edge in range(max_num_chunk + 1)],
              xlabel='Number of byte streams per dataset',
              ylabel='Count(Dataset per File)')
p.xaxis[0].ticker = FixedTicker(ticks=list(range(max_num_chunk + 1)))
show(p)