# Analysis of DMR++ Information

In [1]:
import pandas as pd
from bokeh.charts import output_notebook, output_file, show, Scatter, Histogram, TimeSeries
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import Range1d, HoverTool, ResizeTool, FixedTicker
output_notebook()

In [2]:
# Load the DMR++ statistics table. The path is relative, so it resolves against the
# notebook's current working directory (three levels up from it).
dmrpp = pd.read_csv('../../../dmrpp_stats.csv')

In [3]:
dmrpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8120478 entries, 0 to 8120477
Data columns (total 7 columns):
File          object
Dataset       object
Chunk_Flag    bool
UUID          object
Checksum      object
Offset        int64
Size          int64
dtypes: bool(1), int64(2), object(4)
memory usage: 379.5+ MB


In [4]:
dmrpp.head(3)

Unnamed: 0,File,Dataset,Chunk_Flag,UUID,Checksum,Offset,Size
0,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/TropHeight_TqJ_A_sdev,True,2ab4f832-3400-40b3-a3f7-f4d234610b30,9ce93facba2aee55317040200b3745b6,220558115,83949
1,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/Temperature_A_sdev,True,042cb853-02a7-4f48-a492-67c972f14f15,68e89dda59f453c3c91dfd60e2a0bd75,72436482,230741
2,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/Temperature_A_sdev,True,797f55db-56f7-455d-b2ee-064811ebea4b,2a5ea486540dac3b9bdfcbaa621c0e43,1740209,232814


In [5]:
dmrpp.tail(3)

Unnamed: 0,File,Dataset,Chunk_Flag,UUID,Checksum,Offset,Size
8120475,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA2/Data Fields/Temperature,False,1c73591f-e26c-402a-8429-0941e78a9708,d709add9a8fd30f7432e76d43def1494,40776,128
8120476,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA1/Data Fields/Pressure,False,cdf9ca2e-7dd2-4bb4-9af8-24108e51ac0c,b5ce7b2e71f14b237f1d8a6c2dacd247,40672,8
8120477,za_2_2d_yz.h5,/HDFEOS INFORMATION/StructMetadata.0,False,b35ec1ca-29cc-4e4d-9df6-465dc2c0e794,dca06d8dadcf1a6a679c4fa50ecf4972,5304,32000


Create a new column for file types:

In [6]:
def file_type(v):
    """Map a file name to a coarse collection label.

    Parameters
    ----------
    v : str
        File name to classify.

    Returns
    -------
    str
        'airs' when the name starts with 'AIRS', 'merra' when it starts
        with 'MERRA2', and 'sample' for everything else.
    """
    # Prefixes are tried in order; the first one that matches decides the label.
    for prefix, label in (('AIRS', 'airs'), ('MERRA2', 'merra')):
        if v.startswith(prefix):
            return label
    return 'sample'

In [7]:
# Label every row with the value file_type() returns for its File name
# ('airs', 'merra' or 'sample'); this adds the Type column to dmrpp in place.
dmrpp['Type'] = dmrpp['File'].map(file_type)

What do we have now?

In [8]:
dmrpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8120478 entries, 0 to 8120477
Data columns (total 8 columns):
File          object
Dataset       object
Chunk_Flag    bool
UUID          object
Checksum      object
Offset        int64
Size          int64
Type          object
dtypes: bool(1), int64(2), object(5)
memory usage: 441.4+ MB


## How many records for each file type?

In [9]:
grp = dmrpp.groupby('Type')

In [10]:
grp.size()

Type
airs       569765
merra     7494823
sample      55890
dtype: int64

## Analysis: AIRS

Select only the AIRS data:

In [11]:
# Boolean-mask selection: keep the rows whose Type label is 'airs'.
airs = dmrpp.loc[dmrpp['Type'] == 'airs']

In [12]:
airs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569765 entries, 0 to 569764
Data columns (total 8 columns):
File          569765 non-null object
Dataset       569765 non-null object
Chunk_Flag    569765 non-null bool
UUID          569765 non-null object
Checksum      569765 non-null object
Offset        569765 non-null int64
Size          569765 non-null int64
Type          569765 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 35.3+ MB


### Byte Streams

#### How many byte streams with unique content?

In [13]:
x = airs.Checksum.value_counts()

In [14]:
len(x.index)

330353

#### Which are the Top 20 byte streams based on content?

In [15]:
x.sort_values(ascending=False).head(20)

c55324594a0787856d9b908fc7a7a201    49640
a43f0ca2084e297f3aa9763a828dcb0f     6570
13194f25768e29a4921fa9f6ba022342     1460
7c90147b4fcb72404dd36c4b0d82051d     1460
7b744463b12501af7f1b66dece5e41b4      730
f6da25808f22177cef49588cc08740ca      365
6c9db2946c92fcf6e0d9f8c3bb49d1ff      365
5bbb0bd53711bae49dc9e3c6b37bebd6      365
61f33d4ee48f21fef8b6ce240dbf12c2      365
cd581cc833cabac662ad43499330bed6      365
a71f8216cff7fca9d2c00e768589efdd      365
19c03d94e619f2f3d8ad1428c6c1389c      365
e03cc92919bb584379ea825173440b1c      365
0b2396e0349f169f6f5e16650f4063c5      365
08676799dc49855f9d71198e0c49d1ec      365
cce9f79f6073bdaf715f1a5a07bf72ec      365
85b0f24c7c9c485d9bbc030ea2d544d4       15
29978cb383d9e3a2cdef03cd72d10ab2       15
9a06b4a54eab7e6fa36f6838d5bc6a99       15
f71143769057ee3a3b25d7487e5bc3d2       15
Name: Checksum, dtype: int64

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [16]:
cksum = 'c55324594a0787856d9b908fc7a7a201'

In [17]:
grp = airs.groupby('Checksum')

Per dataset across all AIRS files that have it:

In [18]:
grp.get_group(cksum)['Dataset'].value_counts()

/GPHeight_MW_A_min        2920
/Temperature_MW_A_min     2920
/Temperature_MW_D_sdev    2920
/Temperature_MW_D_min     2920
/Temperature_MW_A_max     2920
/GPHeight_MW_D_max        2920
/Temperature_MW_D_max     2920
/GPHeight_MW_D            2920
/Temperature_MW_A         2920
/GPHeight_MW_A            2920
/GPHeight_MW_D_sdev       2920
/GPHeight_MW_D_min        2920
/Temperature_MW_A_sdev    2920
/GPHeight_MW_A_sdev       2920
/GPHeight_MW_A_max        2920
/Temperature_MW_D         2920
/Emis_MW_A_min             365
/Emis_MW_A                 365
/Emis_MW_D                 365
/Emis_MW_D_min             365
/Emis_MW_A_sdev            365
/Emis_MW_D_max             365
/Emis_MW_D_sdev            365
/Emis_MW_A_max             365
Name: Dataset, dtype: int64

Per file including all its datasets:

In [19]:
grp.get_group(cksum)['File'].value_counts()

AIRS.2015.05.21.L3.RetStd_IR001.v6.0.31.0.G15220053350.nc.h5    136
AIRS.2015.05.04.L3.RetStd_IR001.v6.0.31.0.G15205031847.nc.h5    136
AIRS.2015.09.10.L3.RetStd_IR001.v6.0.31.0.G15254182904.nc.h5    136
AIRS.2015.02.25.L3.RetStd_IR001.v6.0.11.0.G15057220709.nc.h5    136
AIRS.2015.07.29.L3.RetStd_IR001.v6.0.31.0.G15212175054.nc.h5    136
AIRS.2015.05.09.L3.RetStd_IR001.v6.0.31.0.G15206170855.nc.h5    136
AIRS.2015.06.02.L3.RetStd_IR001.v6.0.31.0.G15199070943.nc.h5    136
AIRS.2015.02.23.L3.RetStd_IR001.v6.0.11.0.G15055193409.nc.h5    136
AIRS.2015.09.04.L3.RetStd_IR001.v6.0.31.0.G15251123039.nc.h5    136
AIRS.2015.01.15.L3.RetStd_IR001.v6.0.11.0.G15016175121.nc.h5    136
AIRS.2015.01.20.L3.RetStd_IR001.v6.0.11.0.G15021184554.nc.h5    136
AIRS.2015.12.04.L3.RetStd_IR001.v6.0.31.0.G15339182910.nc.h5    136
AIRS.2015.05.06.L3.RetStd_IR001.v6.0.31.0.G15205145040.nc.h5    136
AIRS.2015.11.12.L3.RetStd_IR001.v6.0.31.0.G15317210501.nc.h5    136
AIRS.2015.09.14.L3.RetStd_IR001.v6.0.31.0.G15259

#### What is the total number of byte streams in the AIRS files?

In [20]:
x.sum()

569765

#### What is the percentage reduction in number of S3 objects based on repeating checksums?

In [21]:
100 * (len(x.index)/x.sum() - 1)

-42.019429062859246

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [22]:
before = airs.Size.sum()
before

114762412036

In [23]:
after = airs.drop_duplicates('Checksum')['Size'].sum()
after

80984748175

In [24]:
100 * (after/before - 1)

-29.432689032715899

#### What are the size statistics for unique byte streams?

In [25]:
# One row per distinct checksum, so each unique byte stream is counted once.
# Size is divided by 1000 so the axis reads in kilobytes, matching the label.
p = Histogram(
    airs.drop_duplicates('Checksum')['Size'] / 1000,
    bins=50,
    xlabel='Byte Stream Size (kilobytes)',
)
show(p)

In [26]:
airs.drop_duplicates('Checksum')['Size'].describe()

count    3.303530e+05
mean     2.451461e+05
std      2.341771e+05
min      2.000000e+01
25%      8.419200e+04
50%      1.922770e+05
75%      2.363170e+05
max      1.122311e+06
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [27]:
grp = airs.groupby('File')

In [28]:
# Count the distinct dataset paths in each file with the built-in groupby reducer,
# instead of building every group's unique array and measuring it with a Python lambda.
grp.Dataset.nunique().describe()

count    365.0
mean     777.0
std        0.0
min      777.0
25%      777.0
50%      777.0
75%      777.0
max      777.0
Name: Dataset, dtype: float64

#### Is every dataset in every AIRS file chunked?

In [29]:
grp = airs.groupby(['File', 'Dataset'])

In [30]:
# Inner .all(): a (File, Dataset) group is chunked only if every one of its rows has
# Chunk_Flag set. Outer .all(): require that of every group. The groupby reducer
# replaces agg(all), which calls the Python builtin once per group.
grp.Chunk_Flag.all().all()

True

#### How many byte streams per dataset?

In [31]:
grp.UUID.count().describe()

count    283605.000000
mean          2.009009
std           2.458655
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           8.000000
Name: UUID, dtype: float64

In [32]:
# Largest number of byte streams in any AIRS dataset (the max reported by describe()).
max_num_chunk = 8
# One bin per integer count: edges sit at k + 0.5 so each bar is centred on a whole
# number. Bins and ticks are both derived from max_num_chunk so they cannot drift apart.
p = Histogram(grp.UUID.count(),
              bins=[k + 0.5 for k in range(max_num_chunk + 1)],
              xlabel='Number of byte streams per dataset',
              ylabel='Count(Dataset per File)')
p.xaxis[0].ticker = FixedTicker(ticks=list(range(max_num_chunk + 1)))
show(p)

## Analysis: `MERRA2_400.tavgM_2d_slv_Nx`

Select only DMR++ entries for these files:

In [33]:
# Match the file-name fragment literally: str.contains() interprets its pattern as a
# regular expression by default, in which each '.' would match any character.
merra2_tavg = dmrpp[dmrpp.File.str.contains('MERRA2_400.tavgM_2d_slv_Nx', regex=False)]

In [34]:
merra2_tavg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82639 entries, 7981949 to 8064587
Data columns (total 8 columns):
File          82639 non-null object
Dataset       82639 non-null object
Chunk_Flag    82639 non-null bool
UUID          82639 non-null object
Checksum      82639 non-null object
Offset        82639 non-null int64
Size          82639 non-null int64
Type          82639 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 5.1+ MB


### Byte Streams

#### How many byte streams with unique content?

In [35]:
x = merra2_tavg.Checksum.value_counts()

In [36]:
len(x.index)

81057

#### Which are the Top 20 byte streams based on content?

In [37]:
x.sort_values(ascending=False).head(20)

d57d72206d7fb5fad96714288dd86f29    108
3a862ab7992a90ec1f5e7999717f7c7d     54
88ccb77821816d00ce02b679bd54523f     54
14fcc6e1b10445175cd974c68f575a86     54
08044720030d15a8aa892941bdf6a40e     54
6ed95e16d4b2f755e1ff226faedac1d1     54
dc51ab56717a3cae07a2ef00aa155512     54
196be26f3d971e3c37190636133fee49     54
adb99ec2a68b1d16fc9cd2f1d33e983d     13
03e10d25b09a490f92d2b279c0340d4c     13
f1d3ff8443297732862df21dc4e57262     13
1e408d3959b79d5306aed99f32349e85      5
2f6085a69dfc7614c41098551e61837a      5
79257d601d89b6cd226c9a48b8d71dae      5
fb17b5a07f6f30208d6d6347992e3866      5
c2e47869aea0dc00ea6b1385eac9cd8b      5
f800b82f340c8ea2e61fc42115743f25      5
12a0d9a957b47592b3807470a6e0b510      5
863e03ddb849236789b8b94acb6791ac      5
6b5d1dbdb8b04c990bfc4ecb30b93831      5
Name: Checksum, dtype: int64

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [38]:
cksum = 'd57d72206d7fb5fad96714288dd86f29'

In [39]:
grp = merra2_tavg.groupby('Checksum')

Per dataset across all the files that have it:

In [40]:
grp.get_group(cksum)['Dataset'].value_counts()

/DISPH        54
/Var_DISPH    54
Name: Dataset, dtype: int64

Per file including all its datasets:

In [41]:
grp.get_group(cksum)['File'].value_counts().sort_index()

MERRA2_400.tavgM_2d_slv_Nx.201101.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201102.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201103.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201104.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201105.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201106.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201107.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201108.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201109.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201110.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201111.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201112.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201201.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201202.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201203.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201204.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201205.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201206.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201207.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201208.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201209.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201210.nc4    2
MERRA2_400.tavgM_2d_slv_Nx.201211.nc4    2
MERRA2_400.

#### What is the total number of byte streams in the files?

In [42]:
x.sum()

82639

#### What is the percentage reduction in number of S3 objects based on repeating checksums?

In [43]:
100 * (len(x.index)/x.sum() - 1)

-1.9143503672600093

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [44]:
before = merra2_tavg.Size.sum()
before

3912188540

In [45]:
after = merra2_tavg.drop_duplicates('Checksum')['Size'].sum()
after

3891337556

In [46]:
100 * (after/before - 1)

-0.53297492661230628

#### What are the size statistics for unique byte streams?

In [47]:
# One row per distinct checksum, so each unique byte stream is counted once.
# Size is divided by 1000 so the axis reads in kilobytes, matching the label.
p = Histogram(
    merra2_tavg.drop_duplicates('Checksum')['Size'] / 1000,
    bins=50,
    xlabel='Byte Stream Size (kilobytes)',
)
show(p)

In [48]:
merra2_tavg.drop_duplicates('Checksum')['Size'].describe()

count     81057.000000
mean      48007.421395
std       97027.972354
min           4.000000
25%       33161.000000
50%       37944.000000
75%       40204.000000
max      831744.000000
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [49]:
grp = merra2_tavg.groupby('File')

In [50]:
# Count the distinct dataset paths in each file with the built-in groupby reducer,
# instead of building every group's unique array and measuring it with a Python lambda.
grp.Dataset.nunique().describe()

count    67.0
mean     97.0
std       0.0
min      97.0
25%      97.0
50%      97.0
75%      97.0
max      97.0
Name: Dataset, dtype: float64

#### Is every dataset in every file chunked?

In [51]:
grp = merra2_tavg.groupby(['File', 'Dataset'])

In [52]:
# Inner .all(): a (File, Dataset) group is chunked only if every one of its rows has
# Chunk_Flag set. Outer .all(): require that of every group. The groupby reducer
# replaces agg(all), which calls the Python builtin once per group.
grp.Chunk_Flag.all().all()

False

#### How many byte streams per dataset?

In [53]:
grp.UUID.count().describe()

count    6499.000000
mean       12.715649
std         6.203566
min         1.000000
25%        16.000000
50%        16.000000
75%        16.000000
max        16.000000
Name: UUID, dtype: float64

In [54]:
# Upper bound on byte streams per dataset; drives both the bin edges and the axis ticks.
max_num_chunk = 16
# One bin per integer count: edges sit at k + 0.5 so each bar is centred on a whole number.
p = Histogram(
    grp.UUID.count(),
    bins=[k + 0.5 for k in range(max_num_chunk + 1)],
    xlabel='Number of byte streams per dataset',
    ylabel='Count(Dataset per File)',
)
p.xaxis[0].ticker = FixedTicker(ticks=list(range(max_num_chunk + 1)))
show(p)

## Analysis: `MERRA2_100.tavgM_2d_int_Nx`

Select only DMR++ entries for these files:

In [55]:
# Match the file-name fragment literally: str.contains() interprets its pattern as a
# regular expression by default, in which each '.' would match any character.
merra2_tavg = dmrpp[dmrpp.File.str.contains('MERRA2_100.tavgM_2d_int_Nx', regex=False)]

In [56]:
merra2_tavg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 447408 entries, 1901864 to 2349271
Data columns (total 8 columns):
File          447408 non-null object
Dataset       447408 non-null object
Chunk_Flag    447408 non-null bool
UUID          447408 non-null object
Checksum      447408 non-null object
Offset        447408 non-null int64
Size          447408 non-null int64
Type          447408 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 27.7+ MB


### Byte Streams

#### How many byte streams with unique content?

In [57]:
x = merra2_tavg.Checksum.value_counts()

In [58]:
len(x.index)

413946

#### Which are the Top 20 byte streams based on content?

In [59]:
x.sort_values(ascending=False).head(20)

d57d72206d7fb5fad96714288dd86f29    11164
f6c9ef6c28c5715b914e5d7194b9f131     3902
6ed95e16d4b2f755e1ff226faedac1d1      144
dc51ab56717a3cae07a2ef00aa155512      144
196be26f3d971e3c37190636133fee49      144
1927537cb4f692e21debf2486bbc9826        2
b428ba31da08ceb36ee28fe89df1b127        2
ea3529b4fec026d143ab83095431d4df        2
19f71862af3ee04656e8d4b99aa8cf71        2
a8e627af78e472a1fbf54fb083bf4194        2
72a8fc2a38e7a0562ccdc0780ba980f3        2
5e6d44e659873254af8c81e09b65ee21        2
1bc7807f7dc4b4f66efb030399aadcd3        2
8f987e521323df4cb2aa8d4a05fee55f        2
93fe7058972097cffddd22d4b0e07899        2
c6e8826b0ff13d7aca90339525a923d1        2
7af80492bd4dfd4f86f33af1e66816f8        2
d2d1cb2302be0e85a000f6d89e1bd9c1        2
542ae7c2a697f0419bf622602a8b4687        2
bbfd613088f75c73175268749406a4ce        2
Name: Checksum, dtype: int64

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [60]:
cksum = 'd57d72206d7fb5fad96714288dd86f29'

In [61]:
grp = merra2_tavg.groupby('Checksum')

Per dataset across all the files that have it:

In [62]:
grp.get_group(cksum)['Dataset'].value_counts()

/DQVDT_FIL        1728
/Var_DQIDT_FIL    1728
/Var_DOXDT_FIL    1728
/DQIDT_FIL        1728
/Var_DQVDT_FIL    1728
/DOXDT_FIL        1728
/PRECSN            160
/Var_PRECSN        160
/Var_QTFILL        119
/QTFILL            119
/DQLDT_FIL         119
/Var_DQLDT_FIL     119
Name: Dataset, dtype: int64

Per file including all its datasets:

In [63]:
grp.get_group(cksum)['File'].value_counts().sort_index()

MERRA2_100.tavgM_2d_int_Nx.198001.nc4    78
MERRA2_100.tavgM_2d_int_Nx.198002.nc4    78
MERRA2_100.tavgM_2d_int_Nx.198003.nc4    80
MERRA2_100.tavgM_2d_int_Nx.198004.nc4    74
MERRA2_100.tavgM_2d_int_Nx.198005.nc4    82
MERRA2_100.tavgM_2d_int_Nx.198006.nc4    72
MERRA2_100.tavgM_2d_int_Nx.198007.nc4    74
MERRA2_100.tavgM_2d_int_Nx.198008.nc4    88
MERRA2_100.tavgM_2d_int_Nx.198009.nc4    76
MERRA2_100.tavgM_2d_int_Nx.198010.nc4    78
MERRA2_100.tavgM_2d_int_Nx.198011.nc4    74
MERRA2_100.tavgM_2d_int_Nx.198012.nc4    74
MERRA2_100.tavgM_2d_int_Nx.198101.nc4    78
MERRA2_100.tavgM_2d_int_Nx.198102.nc4    84
MERRA2_100.tavgM_2d_int_Nx.198103.nc4    78
MERRA2_100.tavgM_2d_int_Nx.198104.nc4    74
MERRA2_100.tavgM_2d_int_Nx.198105.nc4    74
MERRA2_100.tavgM_2d_int_Nx.198106.nc4    74
MERRA2_100.tavgM_2d_int_Nx.198107.nc4    80
MERRA2_100.tavgM_2d_int_Nx.198108.nc4    82
MERRA2_100.tavgM_2d_int_Nx.198109.nc4    82
MERRA2_100.tavgM_2d_int_Nx.198110.nc4    90
MERRA2_100.tavgM_2d_int_Nx.19811

#### What is the total number of byte streams in the files?

In [64]:
x.sum()

447408

#### What is the percentage reduction in number of S3 objects based on repeating checksums?

In [65]:
100 * (len(x.index)/x.sum() - 1)

-7.4790794979079482

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [66]:
before = merra2_tavg.Size.sum()
before

16079193101

In [67]:
after = merra2_tavg.drop_duplicates('Checksum')['Size'].sum()
after

15511764737

In [68]:
100 * (after/before - 1)

-3.5289604424534882

#### What are the size statistics for unique byte streams?

In [69]:
# One row per distinct checksum, so each unique byte stream is counted once.
# Size is divided by 1000 so the axis reads in kilobytes, matching the label.
p = Histogram(
    merra2_tavg.drop_duplicates('Checksum')['Size'] / 1000,
    bins=50,
    xlabel='Byte Stream Size (kilobytes)',
)
show(p)

In [70]:
merra2_tavg.drop_duplicates('Checksum')['Size'].describe()

count    413946.000000
mean      37472.918538
std        7710.131963
min          12.000000
25%       37175.000000
50%       39163.000000
75%       41451.000000
max       46620.000000
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [71]:
grp = merra2_tavg.groupby('File')

In [72]:
# Count the distinct dataset paths in each file with the built-in groupby reducer,
# instead of building every group's unique array and measuring it with a Python lambda.
grp.Dataset.nunique().describe()

count    144.0
mean     197.0
std        0.0
min      197.0
25%      197.0
50%      197.0
75%      197.0
max      197.0
Name: Dataset, dtype: float64

#### Is every dataset in every file chunked?

In [73]:
grp = merra2_tavg.groupby(['File', 'Dataset'])

In [74]:
# Inner .all(): a (File, Dataset) group is chunked only if every one of its rows has
# Chunk_Flag set. Outer .all(): require that of every group. The groupby reducer
# replaces agg(all), which calls the Python builtin once per group.
grp.Chunk_Flag.all().all()

True

#### How many byte streams per dataset?

In [75]:
grp.UUID.count().describe()

count    28368.000000
mean        15.771574
std          1.836937
min          1.000000
25%         16.000000
50%         16.000000
75%         16.000000
max         16.000000
Name: UUID, dtype: float64

In [76]:
# Upper bound on byte streams per dataset; drives both the bin edges and the axis ticks.
max_num_chunk = 16
# One bin per integer count: edges sit at k + 0.5 so each bar is centred on a whole number.
p = Histogram(
    grp.UUID.count(),
    bins=[k + 0.5 for k in range(max_num_chunk + 1)],
    xlabel='Number of byte streams per dataset',
    ylabel='Count(Dataset per File)',
)
p.xaxis[0].ticker = FixedTicker(ticks=list(range(max_num_chunk + 1)))
show(p)

## Analysis: `MERRA2_200.tavgM_2d_int_Nx`

Select only DMR++ entries for these files:

In [77]:
# Match the file-name fragment literally: str.contains() interprets its pattern as a
# regular expression by default, in which each '.' would match any character.
merra2_tavg = dmrpp[dmrpp.File.str.contains('MERRA2_200.tavgM_2d_int_Nx', regex=False)]

In [78]:
merra2_tavg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335556 entries, 4098085 to 4433640
Data columns (total 8 columns):
File          335556 non-null object
Dataset       335556 non-null object
Chunk_Flag    335556 non-null bool
UUID          335556 non-null object
Checksum      335556 non-null object
Offset        335556 non-null int64
Size          335556 non-null int64
Type          335556 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 20.8+ MB


### Byte Streams

#### How many byte streams with unique content?

In [79]:
x = merra2_tavg.Checksum.value_counts()

In [80]:
len(x.index)

310419

#### Which are the Top 20 byte streams based on content?

In [81]:
x.sort_values(ascending=False).head(20)

d57d72206d7fb5fad96714288dd86f29    8412
f6c9ef6c28c5715b914e5d7194b9f131    2962
196be26f3d971e3c37190636133fee49     108
6ed95e16d4b2f755e1ff226faedac1d1     108
dc51ab56717a3cae07a2ef00aa155512     108
d5c95f4029f40a582961ae2413cb00cc       2
83add3e22cbd4ec64888b30ee72f3794       2
bd6e2de63e4de5e58521f6ac9fdae3ce       2
5da1281d1f9205b3ccfbebc152b892c3       2
c71d49d3513b7d388ade961dd0a3e61a       2
5f063fa5c5e8f88f299f47cecae3228a       2
9ccbbfa0de035a9d2c6c747eb0cc6f23       2
8cf2d6d08dd8c73574e6567b8180ed57       2
598e5fee7de733f1019bfd2374a8ead3       2
e4947f154599717e2a8c4a1eab14c424       2
24bc597205e3f4bf502685c01f2f3a8e       2
832cf1045a941de28066714253a6ee2c       2
9c53e04435ee429b14e69ea9ab5ca531       2
3aee5e4922e8d27faabb881e19bf2ec5       2
92e1a3376aeab08f8c3aac596e7fd88c       2
Name: Checksum, dtype: int64

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [82]:
cksum = 'd57d72206d7fb5fad96714288dd86f29'

In [83]:
grp = merra2_tavg.groupby('Checksum')

Per dataset across all the files that have it:

In [84]:
grp.get_group(cksum)['Dataset'].value_counts()

/DQVDT_FIL        1296
/Var_DQIDT_FIL    1296
/Var_DOXDT_FIL    1296
/DQIDT_FIL        1296
/Var_DQVDT_FIL    1296
/DOXDT_FIL        1296
/PRECSN            124
/Var_PRECSN        124
/DQLDT_FIL          97
/Var_QTFILL         97
/QTFILL             97
/Var_DQLDT_FIL      97
Name: Dataset, dtype: int64

Per file including all its datasets:

In [85]:
grp.get_group(cksum)['File'].value_counts().sort_index()

MERRA2_200.tavgM_2d_int_Nx.199201.nc4    80
MERRA2_200.tavgM_2d_int_Nx.199202.nc4    74
MERRA2_200.tavgM_2d_int_Nx.199203.nc4    74
MERRA2_200.tavgM_2d_int_Nx.199204.nc4    82
MERRA2_200.tavgM_2d_int_Nx.199205.nc4    80
MERRA2_200.tavgM_2d_int_Nx.199206.nc4    72
MERRA2_200.tavgM_2d_int_Nx.199207.nc4    78
MERRA2_200.tavgM_2d_int_Nx.199208.nc4    80
MERRA2_200.tavgM_2d_int_Nx.199209.nc4    78
MERRA2_200.tavgM_2d_int_Nx.199210.nc4    76
MERRA2_200.tavgM_2d_int_Nx.199211.nc4    74
MERRA2_200.tavgM_2d_int_Nx.199212.nc4    84
MERRA2_200.tavgM_2d_int_Nx.199301.nc4    92
MERRA2_200.tavgM_2d_int_Nx.199302.nc4    82
MERRA2_200.tavgM_2d_int_Nx.199303.nc4    76
MERRA2_200.tavgM_2d_int_Nx.199304.nc4    74
MERRA2_200.tavgM_2d_int_Nx.199305.nc4    78
MERRA2_200.tavgM_2d_int_Nx.199306.nc4    74
MERRA2_200.tavgM_2d_int_Nx.199307.nc4    80
MERRA2_200.tavgM_2d_int_Nx.199308.nc4    74
MERRA2_200.tavgM_2d_int_Nx.199309.nc4    76
MERRA2_200.tavgM_2d_int_Nx.199310.nc4    74
MERRA2_200.tavgM_2d_int_Nx.19931

#### What is the total number of byte streams in the files?

In [86]:
x.sum()

335556

#### What is the percentage reduction in number of S3 objects based on repeating checksums?

In [87]:
100 * (len(x.index)/x.sum() - 1)

-7.4911490183456664

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [88]:
before = merra2_tavg.Size.sum()
before

12057760738

In [89]:
after = merra2_tavg.drop_duplicates('Checksum')['Size'].sum()
after

11630803129

In [90]:
100 * (after/before - 1)

-3.5409361512245274

#### What are the size statistics for unique byte streams?

In [91]:
# One row per distinct checksum, so each unique byte stream is counted once.
# Size is divided by 1000 so the axis reads in kilobytes, matching the label.
p = Histogram(
    merra2_tavg.drop_duplicates('Checksum')['Size'] / 1000,
    bins=50,
    xlabel='Byte Stream Size (kilobytes)',
)
show(p)

In [92]:
merra2_tavg.drop_duplicates('Checksum')['Size'].describe()

count    310419.000000
mean      37468.077434
std        7695.430475
min          12.000000
25%       37168.000000
50%       39165.000000
75%       41436.000000
max       46514.000000
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [93]:
grp = merra2_tavg.groupby('File')

In [94]:
# Count the distinct dataset paths in each file with the built-in groupby reducer,
# instead of building every group's unique array and measuring it with a Python lambda.
grp.Dataset.nunique().describe()

count    108.0
mean     197.0
std        0.0
min      197.0
25%      197.0
50%      197.0
75%      197.0
max      197.0
Name: Dataset, dtype: float64

#### Is every dataset in every file chunked?

In [95]:
grp = merra2_tavg.groupby(['File', 'Dataset'])

In [96]:
# Inner .all(): a (File, Dataset) group is chunked only if every one of its rows has
# Chunk_Flag set. Outer .all(): require that of every group. The groupby reducer
# replaces agg(all), which calls the Python builtin once per group.
grp.Chunk_Flag.all().all()

True

#### How many byte streams per dataset?

In [97]:
grp.UUID.count().describe()

count    21276.000000
mean        15.771574
std          1.836947
min          1.000000
25%         16.000000
50%         16.000000
75%         16.000000
max         16.000000
Name: UUID, dtype: float64

In [98]:
# Upper bound on byte streams per dataset; drives both the bin edges and the axis ticks.
max_num_chunk = 16
# One bin per integer count: edges sit at k + 0.5 so each bar is centred on a whole number.
p = Histogram(
    grp.UUID.count(),
    bins=[k + 0.5 for k in range(max_num_chunk + 1)],
    xlabel='Number of byte streams per dataset',
    ylabel='Count(Dataset per File)',
)
p.xaxis[0].ticker = FixedTicker(ticks=list(range(max_num_chunk + 1)))
show(p)

## Analysis: `MERRA2_300.tavgM_2d_int_Nx`

Select only DMR++ entries for these files:

In [99]:
# Match the file-name fragment literally: str.contains() interprets its pattern as a
# regular expression by default, in which each '.' would match any character.
merra2_tavg = dmrpp[dmrpp.File.str.contains('MERRA2_300.tavgM_2d_int_Nx', regex=False)]

In [100]:
merra2_tavg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372840 entries, 6105013 to 6477852
Data columns (total 8 columns):
File          372840 non-null object
Dataset       372840 non-null object
Chunk_Flag    372840 non-null bool
UUID          372840 non-null object
Checksum      372840 non-null object
Offset        372840 non-null int64
Size          372840 non-null int64
Type          372840 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 23.1+ MB


### Byte Streams

#### How many byte streams with unique content?

In [101]:
x = merra2_tavg.Checksum.value_counts()

In [102]:
len(x.index)

344840

#### Which are the Top 20 byte streams based on content?

In [103]:
x.sort_values(ascending=False).head(20)

d57d72206d7fb5fad96714288dd86f29    9416
f6c9ef6c28c5715b914e5d7194b9f131    3354
196be26f3d971e3c37190636133fee49     120
dc51ab56717a3cae07a2ef00aa155512     120
6ed95e16d4b2f755e1ff226faedac1d1     120
ff58f5edc4d8a12de78dce452753632b       2
07de15dacd21c6f6b4926295d2931e62       2
1e80903059f254d6deb3fa7783b9c870       2
f52477bcb5cbf066f3e55a98def1b90e       2
f49e56bc46443023207af1bd6cfae879       2
de5e2b1667e0038f405e003185293c99       2
4a520d9ca23bae5ac74c69f07bd2026c       2
b6416c47fe40797984a6592b53bd74f2       2
4572b6dfe3d1de533a1e1a38b65ebc8a       2
dd357a04441158de3139f929a03172e9       2
7218cfb1cf90eceef72ad828b9124ac6       2
6b16bd6fadd818ea889f08eda71d65e9       2
e6c69cad32367bbf63de44350d6fb170       2
f109dc657be4f6684927ef9be540379c       2
b7121463ed8a78611ee65b15edc99783       2
Name: Checksum, dtype: int64

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [104]:
cksum = 'd57d72206d7fb5fad96714288dd86f29'

In [105]:
grp = merra2_tavg.groupby('Checksum')

Per dataset across all the files that have it:

In [106]:
grp.get_group(cksum)['Dataset'].value_counts()

/DQVDT_FIL        1440
/Var_DQIDT_FIL    1440
/Var_DOXDT_FIL    1440
/DQIDT_FIL        1440
/Var_DQVDT_FIL    1440
/DOXDT_FIL        1440
/PRECSN            138
/Var_PRECSN        138
/DQLDT_FIL         125
/Var_QTFILL        125
/QTFILL            125
/Var_DQLDT_FIL     125
Name: Dataset, dtype: int64

Per file including all its datasets:

In [107]:
grp.get_group(cksum)['File'].value_counts().sort_index()

MERRA2_300.tavgM_2d_int_Nx.200101.nc4    82
MERRA2_300.tavgM_2d_int_Nx.200102.nc4    88
MERRA2_300.tavgM_2d_int_Nx.200103.nc4    82
MERRA2_300.tavgM_2d_int_Nx.200104.nc4    78
MERRA2_300.tavgM_2d_int_Nx.200105.nc4    74
MERRA2_300.tavgM_2d_int_Nx.200106.nc4    74
MERRA2_300.tavgM_2d_int_Nx.200107.nc4    78
MERRA2_300.tavgM_2d_int_Nx.200108.nc4    82
MERRA2_300.tavgM_2d_int_Nx.200109.nc4    74
MERRA2_300.tavgM_2d_int_Nx.200110.nc4    74
MERRA2_300.tavgM_2d_int_Nx.200111.nc4    74
MERRA2_300.tavgM_2d_int_Nx.200112.nc4    80
MERRA2_300.tavgM_2d_int_Nx.200201.nc4    80
MERRA2_300.tavgM_2d_int_Nx.200202.nc4    78
MERRA2_300.tavgM_2d_int_Nx.200203.nc4    78
MERRA2_300.tavgM_2d_int_Nx.200204.nc4    74
MERRA2_300.tavgM_2d_int_Nx.200205.nc4    76
MERRA2_300.tavgM_2d_int_Nx.200206.nc4    86
MERRA2_300.tavgM_2d_int_Nx.200207.nc4    84
MERRA2_300.tavgM_2d_int_Nx.200208.nc4    76
MERRA2_300.tavgM_2d_int_Nx.200209.nc4    84
MERRA2_300.tavgM_2d_int_Nx.200210.nc4    74
MERRA2_300.tavgM_2d_int_Nx.20021

#### What is the total number of byte streams in the files?

In [108]:
x.sum()

372840

#### What is the percentage reduction in number of S3 objects based on repeating checksums?

In [109]:
100 * (len(x.index)/x.sum() - 1)

-7.5099238279154541

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [110]:
before = merra2_tavg.Size.sum()
before

13391794899

In [111]:
after = merra2_tavg.drop_duplicates('Checksum')['Size'].sum()
after

12918375774

In [112]:
100 * (after/before - 1)

-3.535143187082046

#### What are the size statistics for unique byte streams?

In [113]:
# One row per distinct checksum, so each unique byte stream is counted once.
# Size is divided by 1000 so the axis reads in kilobytes, matching the label.
p = Histogram(
    merra2_tavg.drop_duplicates('Checksum')['Size'] / 1000,
    bins=50,
    xlabel='Byte Stream Size (kilobytes)',
)
show(p)

In [114]:
merra2_tavg.drop_duplicates('Checksum')['Size'].describe()

count    344840.000000
mean      37461.941115
std        7690.765428
min          12.000000
25%       37184.000000
50%       39138.000000
75%       41427.000000
max       46690.000000
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [115]:
grp = merra2_tavg.groupby('File')

In [116]:
# Count the distinct dataset paths in each file with the built-in groupby reducer,
# instead of building every group's unique array and measuring it with a Python lambda.
grp.Dataset.nunique().describe()

count    120.0
mean     197.0
std        0.0
min      197.0
25%      197.0
50%      197.0
75%      197.0
max      197.0
Name: Dataset, dtype: float64

#### Is every dataset in every file chunked?

In [117]:
grp = merra2_tavg.groupby(['File', 'Dataset'])

In [118]:
# Inner .all(): a (File, Dataset) group is chunked only if every one of its rows has
# Chunk_Flag set. Outer .all(): require that of every group. The groupby reducer
# replaces agg(all), which calls the Python builtin once per group.
grp.Chunk_Flag.all().all()

True

#### How many byte streams per dataset?

In [119]:
grp.UUID.count().describe()

count    23640.000000
mean        15.771574
std          1.836943
min          1.000000
25%         16.000000
50%         16.000000
75%         16.000000
max         16.000000
Name: UUID, dtype: float64

In [120]:
# Upper bound on byte streams per dataset; drives both the bin edges and the axis ticks.
max_num_chunk = 16
# One bin per integer count: edges sit at k + 0.5 so each bar is centred on a whole number.
p = Histogram(
    grp.UUID.count(),
    bins=[k + 0.5 for k in range(max_num_chunk + 1)],
    xlabel='Number of byte streams per dataset',
    ylabel='Count(Dataset per File)',
)
p.xaxis[0].ticker = FixedTicker(ticks=list(range(max_num_chunk + 1)))
show(p)