In [1]:
import pandas as pd
from bokeh.charts import output_notebook, output_file, show, Scatter, Histogram, TimeSeries
from bokeh.plotting import figure
from bokeh.models import Range1d, HoverTool, ResizeTool
output_notebook()

Read the log data and replace column values of `-` with `None`:

In [2]:
s3l = pd.read_csv('../../logs/Arch1/Run01/Arch1-Run01 - S3 Log.csv')
s3l.replace(['-'], [None]);

Column names:

In [3]:
s3l.columns

Index(['Bucket_Owner', 'Bucket', 'Time', 'Remote_IP', 'Requester',
       'Request_ID', 'Operation', 'Key', 'HTTP_method', 'Request_URI',
       'HTTP_status', 'Error_Code', 'Bytes_Sent', 'Object_Size',
       'Total_Time_ms', 'Turn_Around_Time_ms', 'Referrer', 'User_Agent',
       'Version_Id'],
      dtype='object')

Convert the Time column to datetime objects:

In [4]:
s3l['Time'] = pd.to_datetime(s3l['Time'])

Remove the Bucket_Owner and Bucket columns:

In [5]:
s3l.drop(['Bucket', 'Bucket_Owner'], axis=1, inplace=True)

In [6]:
s3l.columns

Index(['Time', 'Remote_IP', 'Requester', 'Request_ID', 'Operation', 'Key',
       'HTTP_method', 'Request_URI', 'HTTP_status', 'Error_Code', 'Bytes_Sent',
       'Object_Size', 'Total_Time_ms', 'Turn_Around_Time_ms', 'Referrer',
       'User_Agent', 'Version_Id'],
      dtype='object')

How many rows are there:

In [7]:
len(s3l.index)

2053

Sort rows based on column Time:

In [8]:
s3l.sort_values(by='Time', ascending=True);

## Data Cleanup

Search for any log entry that indicates some sort of error.

What are the different HTTP status codes:

In [9]:
s3l['HTTP_status'].unique()

array([200, 403])

What are the different AWS S3 system error codes:

In [10]:
s3l.Error_Code.unique()

array(['-', 'AccessDenied', 'InternalError'], dtype=object)

Find the _bad_ S3 requests:

In [11]:
s3l.loc[(s3l['HTTP_status'] == 403) | (s3l['Error_Code'] != '-'), 
        ['HTTP_status', 'Error_Code', 'Key', 'HTTP_method', 'Request_URI', 'User Agent']]

Unnamed: 0,HTTP_status,Error_Code,Key,HTTP_method,Request_URI,User Agent
203,403,AccessDenied,airs/AIRS.2015.07.22.L3.RetStd_IR001.v6.0.31.0...,GET,/airs/AIRS.2015.07.22.L3.RetStd_IR001.v6.0.31....,
1153,200,InternalError,merra2/MERRA2_300.tavgM_2d_int_Nx.200810.nc4,GET,/merra2/MERRA2_300.tavgM_2d_int_Nx.200810.nc4,
1890,403,AccessDenied,airs/AIRS.2015.07.22.L3.RetStd_IR001.v6.0.31.0...,GET,/airs/AIRS.2015.07.22.L3.RetStd_IR001.v6.0.31....,


Remove bad S3 log entries:

In [12]:
s3l.drop(s3l[(s3l['HTTP_status'] == 403) | (s3l['Error_Code'] != '-')].index, 
         inplace=True)

Number of rows now:

In [13]:
len(s3l.index)

2050

## General Data Exploration

Convert the Turn_Around_Time_ms column to numbers (I don't why it is not in the first place):

In [14]:
s3l['Turn_Around_Time_ms'] = pd.to_numeric(s3l['Turn_Around_Time_ms'])

Calculate transfer rate for each S3 request as a new column. The formula is:
$$0.001 * \frac{Bytes\_Sent}{Total\_Time\_ms - Turn\_Around\_Time\_ms}\,\,\rm{MBytes/s}$$

In [15]:
s3l['Trans_Rate_MB/s'] = \
    0.001 * s3l['Bytes_Sent'] / (s3l['Total_Time_ms'] - s3l['Turn_Around_Time_ms'])

User agent list:

In [16]:
s3l.User_Agent.unique()

array(['libcurl/7.19.7 NSS/3.21 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2'], dtype=object)

For Architecture \#1, the number of bytes sent and object size should be the same for all S3 requests:

In [17]:
s3l.Object_Size = pd.to_numeric(s3l.Object_Size)
(s3l.Bytes_Sent == s3l.Object_Size).all()

True

Reduction in the number of bytes pulled out of S3 compared to the original object sizes (it should be __zero__ for Architecture \#1):

In [18]:
def s3_bytes_savings(df):
    """Calculate the total percentage reduction in the bytes returned from S3 compared
       to the original object sizes.
       
       df: Input pandas DataFrame.
    """
    return (df['Bytes_Sent']/df['Object_Size'] - 1).sum() * 100

In [19]:
s3_bytes_savings(s3l)

0.0

## Analisys of All Log Entries

In [20]:
p = figure(x_axis_type='datetime', toolbar_location='above')
p.xaxis.axis_label = 'Time of S3 request'
p.yaxis.axis_label = 'Total S3 request time [ms]'
p.segment(x0=s3l['Time'], y0=([0] * len(s3l['Total_Time_ms'])), 
          x1=s3l['Time'], y1=s3l['Total_Time_ms'], 
          line_alpha=0.5)
p.circle(s3l['Time'], s3l['Total_Time_ms'])
p.add_tools(ResizeTool())
show(p)

Descriptive stats for several columns:

In [21]:
s3l['Bytes_Sent'].describe() 

count    2.050000e+03
mean     1.870752e+08
std      9.768751e+07
min      1.125193e+08
25%      1.128652e+08
50%      1.129954e+08
75%      3.172962e+08
max      3.236127e+08
Name: Bytes_Sent, dtype: float64

In [22]:
s3l['Total_Time_ms'].describe() 

count     2050.000000
mean      4560.855122
std       3867.581576
min       1505.000000
25%       2020.000000
50%       3343.000000
75%       5635.250000
max      73015.000000
Name: Total_Time_ms, dtype: float64

In [23]:
s3l['Turn_Around_Time_ms'].describe()

count    2050.000000
mean      192.604390
std       147.156139
min        55.000000
25%       120.000000
50%       161.000000
75%       227.000000
max      3908.000000
Name: Turn_Around_Time_ms, dtype: float64

In [24]:
s3l['Trans_Rate_MB/s'].describe()

count    2050.000000
mean       52.153919
std        17.862780
min         4.422471
25%        38.980464
50%        53.841794
75%        68.009022
max        80.771730
Name: Trans_Rate_MB/s, dtype: float64

In [25]:
p = Histogram(s3l['Total_Time_ms'], bins=100,
              xlabel='Total S3 response time [ms]', ylabel='Count(S3 requests)')
p.x_range = Range1d(0, 25000)
show(p)

In [26]:
p = Histogram(s3l['Turn_Around_Time_ms'], bins=50,
              xlabel='Turn-around S3 time [ms]', ylabel='Count(S3 requests)')
# p.x_range = Range1d(0, 25000)
show(p)

In [27]:
tooltips=[('Key', '@Key')]
p = Scatter(s3l, x='Bytes_Sent', y='Total_Time_ms',
            xlabel='Bytes Sent [bytes]', ylabel='Total S3 response time [ms]',
            tooltips=tooltips)
p.add_tools(ResizeTool())
show(p)

In [28]:
tooltips=[('Key', '@Key')]
p = Scatter(s3l, x='Bytes_Sent', y='Turn_Around_Time_ms',
            xlabel='Bytes Sent [bytes]', ylabel='Turn-around S3 response time [ms]',
            tooltips=tooltips)
p.add_tools(ResizeTool())
show(p)

In [29]:
tooltips=[('Key', '@Key'), ('Size', '@Object_Size')]
tot_time_q75 = s3l.Total_Time_ms.quantile(0.75)
idx = s3l.Total_Time_ms > tot_time_q75
p = Scatter(s3l.loc[idx, ['Total_Time_ms', 'Turn_Around_Time_ms', 'Key', 'Object_Size']],
            x='Total_Time_ms', y='Turn_Around_Time_ms',
            xlabel='Total S3 response time [ms]', ylabel='Turnaround S3 time [ms]',
            tooltips=tooltips,
            toolbar_location='above')
p.add_tools(ResizeTool())
show(p)

In [30]:
show(Histogram(s3l['Trans_Rate_MB/s'], bins=50,
               xlabel='Transfer Rate [MB/s]', ylabel='Count(S3 requests)'))

## Splitting Log Entries

The log entries will be split based on the product: AIRS and MERRA2.

In [31]:
airs = s3l[s3l.Key.str.contains('^airs/')]

In [32]:
len(airs.index)

734

In [33]:
merra = s3l[s3l.Key.str.contains('^merra2/')]

In [34]:
len(merra.index)

1316

## Analysis of AIRS Files Log Entries

In [35]:
airs.Bytes_Sent.describe()

count    7.340000e+02
mean     3.174969e+08
std      4.237050e+06
min      2.976742e+08
25%      3.166987e+08
50%      3.185693e+08
75%      3.197637e+08
max      3.236127e+08
Name: Bytes_Sent, dtype: float64

In [36]:
airs.Total_Time_ms.describe()

count      734.000000
mean      7669.535422
std       4618.254771
min       4119.000000
25%       5018.750000
50%       6215.000000
75%       8581.000000
max      73015.000000
Name: Total_Time_ms, dtype: float64

In [37]:
airs['Turn_Around_Time_ms'].describe()

count     734.000000
mean      192.004087
std       177.829973
min        59.000000
25%       117.250000
50%       155.000000
75%       226.000000
max      3908.000000
Name: Turn_Around_Time_ms, dtype: float64

In [38]:
airs['Trans_Rate_MB/s'].describe()

count    734.000000
mean      50.631300
std       16.725851
min        4.422471
25%       38.254313
50%       52.659161
75%       65.234246
max       74.527189
Name: Trans_Rate_MB/s, dtype: float64

In [39]:
tooltips=[('Key', '@Key')]
p = Scatter(airs, x='Bytes_Sent', y='Total_Time_ms', toolbar_location='above',
            xlabel='Bytes Sent [bytes]', ylabel='Total S3 response time [ms]',
            title='AIRS Files Only',
            tooltips=tooltips)
show(p)

In [40]:
show(Histogram(airs['Trans_Rate_MB/s'], bins=50, toolbar_location='above',
               xlabel='Transfer Rate [MB/s]', ylabel='Count(S3 requests)',
               title='AIRS Files Only'))

## Analysis of MERRA2 Files Log Entries

In [41]:
merra.Bytes_Sent.describe()

count    1.316000e+03
mean     1.143324e+08
std      8.333033e+06
min      1.125193e+08
25%      1.128267e+08
50%      1.129047e+08
75%      1.129861e+08
max      1.626462e+08
Name: Bytes_Sent, dtype: float64

In [42]:
merra.Total_Time_ms.describe()

count     1316.000000
mean      2826.986322
std       1736.678528
min       1505.000000
25%       1799.750000
50%       2294.000000
75%       3128.500000
max      21984.000000
Name: Total_Time_ms, dtype: float64

In [43]:
merra['Turn_Around_Time_ms'].describe()

count    1316.000000
mean      192.939210
std       126.942738
min        55.000000
25%       123.000000
50%       163.000000
75%       228.000000
max      1976.000000
Name: Turn_Around_Time_ms, dtype: float64

In [44]:
merra['Trans_Rate_MB/s'].describe()

count    1316.000000
mean       53.003160
std        18.418088
min         5.261499
25%        39.834899
50%        54.322188
75%        69.665017
max        80.771730
Name: Trans_Rate_MB/s, dtype: float64

In [45]:
tooltips=[('Key', '@Key')]
p = Scatter(merra, x='Bytes_Sent', y='Total_Time_ms', toolbar_location='above',
            xlabel='Bytes Sent [bytes]', ylabel='Total S3 response time [ms]',
            title='MERRA2 Files Only',
            tooltips=tooltips)
show(p)

In [46]:
show(Histogram(merra['Trans_Rate_MB/s'], bins=50, toolbar_location='above',
               xlabel='Transfer Rate [MB/s]', ylabel='Count(S3 requests)',
               title='MERRA2 Files Only'))