In [43]:
import pandas as pd
from bokeh.charts import output_notebook, output_file, show, Scatter, Histogram, TimeSeries
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import Range1d, HoverTool, ResizeTool, FixedTicker
output_notebook()

In [44]:
# Read the dmrpp statistics CSV into a DataFrame; the path is relative to the working directory.
dmrpp = pd.read_csv(filepath_or_buffer='../../../dmrpp_stats.csv')

In [45]:
# Summarize the columns, their dtypes, and the memory footprint of the loaded table.
dmrpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8120478 entries, 0 to 8120477
Data columns (total 7 columns):
File          object
Dataset       object
Chunk_Flag    bool
UUID          object
Checksum      object
Offset        int64
Size          int64
dtypes: bool(1), int64(2), object(4)
memory usage: 379.5+ MB


In [46]:
# Preview the first three rows of the table.
dmrpp.head(3)

Unnamed: 0,File,Dataset,Chunk_Flag,UUID,Checksum,Offset,Size
0,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/TropHeight_TqJ_D_max,True,8c3b0087-973a-4073-8215-8490fd26065c,d4a492ad98ae783145c9286cf69e5b8a,283428105,149944
1,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/CoarseCloudFrc_TqJ_D_ct,True,065860ce-ebd0-465a-8512-bf7d56557a58,0555723dc275296109273ecf6bb8fe64,294468920,112388
2,AIRS.2015.01.01.L3.RetStd_IR001.v6.0.11.0.G150...,/O3_VMR_TqJ_D_min,True,9752def8-fd3a-462f-9f86-700a2790b89d,bcaf632733e1ef9ad9a281589c02c4dc,301573840,233452


In [47]:
# Preview the last three rows of the table.
dmrpp.tail(3)

Unnamed: 0,File,Dataset,Chunk_Flag,UUID,Checksum,Offset,Size
8120475,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA1/Data Fields/Temperature,False,38c4ac4b-8f4e-414e-a6c3-5342bacd0a60,ac971f91f30b76bc1b481501b0a53542,40696,32
8120476,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA2/Data Fields/Temperature,False,1c73591f-e26c-402a-8429-0941e78a9708,d709add9a8fd30f7432e76d43def1494,40776,128
8120477,za_2_2d_yz.h5,/HDFEOS/ZAS/ZA1/Data Fields/Latitude,False,cf98febc-e0ea-4cac-ad5c-423e91ddbca8,6d8ca9f67d9914dd04b31533021b7e17,40680,16


Create a new column for file types:

In [48]:
def file_type(v):
    """Classify a file name into a coarse collection label.

    Names that start with ``'AIRS'`` yield ``'airs'``, names that start with
    ``'MERRA2'`` yield ``'merra'``, and every other name yields ``'sample'``.
    The prefix match is case-sensitive.
    """
    # Prefixes are checked in order; the first match wins.
    prefix_labels = (('AIRS', 'airs'), ('MERRA2', 'merra'))
    for prefix, label in prefix_labels:
        if v.startswith(prefix):
            return label
    return 'sample'

In [49]:
# Label every row with the file-type category derived from its file name.
dmrpp['Type'] = dmrpp.File.map(file_type)

What do we have now?

In [50]:
# Re-inspect the schema after the Type column has been added.
dmrpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8120478 entries, 0 to 8120477
Data columns (total 8 columns):
File          object
Dataset       object
Chunk_Flag    bool
UUID          object
Checksum      object
Offset        int64
Size          int64
Type          object
dtypes: bool(1), int64(2), object(5)
memory usage: 441.4+ MB


## How many records for each file type?

In [51]:
# Group all records by their file-type label.
grp = dmrpp.groupby(by='Type')

In [52]:
# Number of rows in each Type group.
grp.size()

Type
airs       569765
merra     7494823
sample      55890
dtype: int64

## Analysis: AIRS

Select only the AIRS data:

In [53]:
# Keep only the rows whose Type label is 'airs'.
airs = dmrpp[dmrpp['Type'] == 'airs']

In [54]:
# Summarize the AIRS-only subset.
airs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569765 entries, 0 to 569764
Data columns (total 8 columns):
File          569765 non-null object
Dataset       569765 non-null object
Chunk_Flag    569765 non-null bool
UUID          569765 non-null object
Checksum      569765 non-null object
Offset        569765 non-null int64
Size          569765 non-null int64
Type          569765 non-null object
dtypes: bool(1), int64(2), object(5)
memory usage: 35.3+ MB


### Byte Streams

#### How many byte streams have unique content?

In [55]:
# Count how many times each checksum value occurs in the AIRS rows.
x = airs['Checksum'].value_counts()

In [56]:
# One entry per distinct checksum, so the length is the number of unique byte streams.
len(x)

330353

#### Which are the most frequent byte streams based on content?

In [57]:
# List checksum counts from most to least frequent.
# NOTE(review): value_counts() already returns counts in descending order by default,
# so this sort looks redundant — confirm tie ordering does not matter before removing it.
x.sort_values(ascending=False)

c55324594a0787856d9b908fc7a7a201    49640
a43f0ca2084e297f3aa9763a828dcb0f     6570
7c90147b4fcb72404dd36c4b0d82051d     1460
13194f25768e29a4921fa9f6ba022342     1460
7b744463b12501af7f1b66dece5e41b4      730
61f33d4ee48f21fef8b6ce240dbf12c2      365
cce9f79f6073bdaf715f1a5a07bf72ec      365
19c03d94e619f2f3d8ad1428c6c1389c      365
f6da25808f22177cef49588cc08740ca      365
a71f8216cff7fca9d2c00e768589efdd      365
cd581cc833cabac662ad43499330bed6      365
e03cc92919bb584379ea825173440b1c      365
0b2396e0349f169f6f5e16650f4063c5      365
6c9db2946c92fcf6e0d9f8c3bb49d1ff      365
08676799dc49855f9d71198e0c49d1ec      365
5bbb0bd53711bae49dc9e3c6b37bebd6      365
4db7b39b740cd3f1ccec16a08015fdc3       15
4a43fcbd965c31d5d53f914568186e80       15
895b446d6200a12138b94b5e7eaa5e8b       15
6b476e480140f1a8dae02bf74653c238       15
d3f1e0985027c33be35851f75ec078d8       15
d722ddf83cd616de0a3ad763e8d2bfaa       15
5c496ceecf57bbbc31ec6f89ba67f172       15
2c05eeea67bc2452e063d8d74f680d62  

#### Where is the same byte stream stored?

Byte stream we are looking for:

In [58]:
# Checksum to investigate; it is the most frequent one in the recorded AIRS counts.
cksum = 'c55324594a0787856d9b908fc7a7a201'

In [59]:
# Group the AIRS rows by checksum so a single checksum's rows can be pulled out.
grp = airs.groupby(by='Checksum')

Per dataset across all AIRS files that have it:

In [60]:
# For the rows sharing this checksum, count occurrences per dataset name.
grp.get_group(cksum).Dataset.value_counts()

/Temperature_MW_D_sdev    2920
/Temperature_MW_A         2920
/GPHeight_MW_D_min        2920
/Temperature_MW_D_max     2920
/GPHeight_MW_A_min        2920
/GPHeight_MW_A_max        2920
/Temperature_MW_A_max     2920
/Temperature_MW_D_min     2920
/GPHeight_MW_A_sdev       2920
/GPHeight_MW_D            2920
/GPHeight_MW_D_sdev       2920
/Temperature_MW_A_min     2920
/GPHeight_MW_A            2920
/Temperature_MW_D         2920
/GPHeight_MW_D_max        2920
/Temperature_MW_A_sdev    2920
/Emis_MW_D                 365
/Emis_MW_A                 365
/Emis_MW_D_sdev            365
/Emis_MW_D_max             365
/Emis_MW_A_sdev            365
/Emis_MW_A_max             365
/Emis_MW_D_min             365
/Emis_MW_A_min             365
Name: Dataset, dtype: int64

Per file including all its datasets:

In [61]:
# For the rows sharing this checksum, count occurrences per file name.
grp.get_group(cksum).File.value_counts()

AIRS.2015.09.23.L3.RetStd_IR001.v6.0.31.0.G15281123841.nc.h5    136
AIRS.2015.08.03.L3.RetStd_IR001.v6.0.31.0.G15216225225.nc.h5    136
AIRS.2015.11.13.L3.RetStd_IR001.v6.0.31.0.G15318181838.nc.h5    136
AIRS.2015.12.06.L3.RetStd_IR001.v6.0.31.0.G15341175309.nc.h5    136
AIRS.2015.04.28.L3.RetStd_IR001.v6.0.31.0.G15201121931.nc.h5    136
AIRS.2015.05.18.L3.RetStd_IR001.v6.0.31.0.G15214034207.nc.h5    136
AIRS.2015.05.22.L3.RetStd_IR001.v6.0.31.0.G15225112057.nc.h5    136
AIRS.2015.08.25.L3.RetStd_IR001.v6.0.31.0.G15240170412.nc.h5    136
AIRS.2015.01.07.L3.RetStd_IR001.v6.0.11.0.G15008175227.nc.h5    136
AIRS.2015.03.12.L3.RetStd_IR001.v6.0.11.0.G15075113344.nc.h5    136
AIRS.2015.03.01.L3.RetStd_IR001.v6.0.11.0.G15075145757.nc.h5    136
AIRS.2015.06.13.L3.RetStd_IR001.v6.0.31.0.G15207022701.nc.h5    136
AIRS.2015.10.29.L3.RetStd_IR001.v6.0.31.0.G15303183201.nc.h5    136
AIRS.2015.03.30.L3.RetStd_IR001.v6.0.31.0.G15187110506.nc.h5    136
AIRS.2015.12.27.L3.RetStd_IR001.v6.0.31.0.G15363

#### What is the total number of byte streams in the AIRS files?

In [62]:
# Summing the per-checksum occurrence counts gives the total number of AIRS rows.
x.sum()

569765

#### What is the percentage reduction in number of S3 objects based on repeating checksums?

In [63]:
# Relative change from total rows to distinct checksums, as a percentage (negative = reduction).
(len(x.index) / x.sum() - 1) * 100

-42.019429062859246

#### What is the percentage reduction in bytes stored in S3 based on repeating checksums?

In [64]:
# Total of the Size column over every AIRS row, duplicates included.
before = airs['Size'].sum()
before

114762412036

In [65]:
# Total of the Size column after keeping only the first row for each checksum.
after = airs.drop_duplicates(subset='Checksum').Size.sum()
after

80984748175

In [66]:
# Relative change in total size after de-duplication, as a percentage (negative = reduction).
(after / before - 1) * 100

-29.432689032715899

#### What are the size statistics for unique byte streams?

In [67]:
# 50-bin histogram of Size for checksum-unique rows; Size is divided by 1000 to match
# the kilobyte axis label.
p = Histogram(airs.drop_duplicates('Checksum')['Size']/1000, bins=50,
             xlabel='Byte Stream Size (kilobytes)')
show(p)

In [68]:
# Descriptive statistics of Size over checksum-unique rows.
airs.drop_duplicates(subset='Checksum').Size.describe()

count    3.303530e+05
mean     2.451461e+05
std      2.341771e+05
min      2.000000e+01
25%      8.419200e+04
50%      1.922770e+05
75%      2.363170e+05
max      1.122311e+06
Name: Size, dtype: float64

### Datasets

#### How many datasets per file?

In [69]:
# Group the AIRS rows by the file they belong to.
grp = airs.groupby(by='File')

In [70]:
# Count the distinct dataset names in each file, then summarize those counts.
grp['Dataset'].unique().apply(len).describe()

count    365.0
mean     777.0
std        0.0
min      777.0
25%      777.0
50%      777.0
75%      777.0
max      777.0
Name: Dataset, dtype: float64

#### Is every dataset in every AIRS file chunked?

In [71]:
# Group the AIRS rows by (file, dataset) pair.
grp = airs.groupby(by=['File', 'Dataset'])

In [72]:
# Reduce Chunk_Flag with all() inside each (File, Dataset) group, then across groups:
# the result is True only when every row's flag is True.
grp.Chunk_Flag.agg(all).all()

True

#### How many byte streams per dataset?

In [73]:
# Count UUID entries in each (File, Dataset) group, then summarize those counts.
grp['UUID'].count().describe()

count    283605.000000
mean          2.009009
std           2.458655
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           8.000000
Name: UUID, dtype: float64

In [74]:
# Histogram of UUID counts per (File, Dataset) group. The bin edges sit at half-integers
# from 0.5 to 8.5, so each integer count from 1 through 8 falls in its own bin.
p = Histogram(grp.UUID.count(), bins=[.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5],
              xlabel='Number of byte streams per dataset',
              ylabel='Count(Dataset per File)')
# Pin the x-axis ticks to the integers 0 through 8.
p.xaxis[0].ticker = FixedTicker(ticks=list(range(9)))
show(p)