In [1]:
import chart_studio.plotly as py
import cufflinks as cf
import pandas as pd
import numpy as np

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')

df_d = pd.read_csv("days_data.csv")
df_m = pd.read_csv("minute_data.csv")
df_s = pd.read_csv("second_data.csv")

# Day
Each row is a single day

In [2]:
df_d.head(5)

Unnamed: 0.1,Unnamed: 0,count,day_of_week
0,0,8822189.0,Saturday
1,1,10653900.0,Sunday
2,2,12206610.0,Monday
3,3,11079840.0,Tuesday
4,4,11976070.0,Wednesday


In [3]:
df_d.iplot(y="count")

# Minute
Each row is a single minute

In [4]:
df_m.head(5)

Unnamed: 0.1,Unnamed: 0,count
0,0,3659
1,1,3465
2,2,3159
3,3,3128
4,4,3544


In [5]:
df_m.iplot(y="count")

# Second
Each row is a single second

In [6]:
df_s.head(5)

Unnamed: 0.1,Unnamed: 0,count
0,0,87
1,1,76
2,2,81
3,3,110
4,4,78


In [7]:
df_s.iplot(y="count")

# Example: Mutations on Per Minute Data

In [8]:
import datetime
import random
from itertools import cycle
import random
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Keep getting data from our df_m (minute) data in a circular way
# This allows us to generate data and keep repeating until we run out of days in our dataset
def get_circular_data():
    circular_list = cycle(df_m["count"].values)
    return circular_list

def generate_data(start, end, step, my_func):
    dt_first_timestamp_in_sec = int(start.timestamp())
    dt_first_timestamp_in_sec

    result = []
    while start < end:
        timestamp = start.strftime('%Y-%m-%d %H:%M:%S')
        result.append({"timestamp":timestamp,
                       "value": next(my_func)
                      })
        start += step
    return result

# Generate some data from Feb 1st 2019 to Feb 5th 2019
start = datetime.datetime(2019, 2, 1)
end = datetime.datetime(2019, 2, 5, 23, 59, 59)
step = datetime.timedelta(minutes=1) # By minutes (because we're using our minute dataset)


In [9]:
df = pd.DataFrame(generate_data(start,end,step,get_circular_data()))

In [10]:
df.iplot(y="value")

In [11]:
df.head()

Unnamed: 0,timestamp,value
0,2019-02-01 00:00:00,3659
1,2019-02-01 00:01:00,3465
2,2019-02-01 00:02:00,3159
3,2019-02-01 00:03:00,3128
4,2019-02-01 00:04:00,3544


In [12]:
# Latency is loosely related to traffic volume
#  Although good systems don't see an increase in latency when the traffic increases
def latency(value):
    latency = 500 # 100 ms is the default
    
    if value > 9000:
        latency = latency * (1.01+random.uniform(0, 1))
    if value > 11000:
        latency = latency * (1.09+random.uniform(0, 1))
    if value < 5000:
        latency = latency * (0.7+random.uniform(0, 1))
    
    return latency
    
# We don't want to see many database errors, so let's simulate rare occurences
def db_errors(value, host_num):
    if host_num == 1:
        1 if random.randint(0,100000) > 77777 else np.NaN
    if host_num == 2:
        1 if random.randint(0,100000) > 99998 else np.NaN
    if host_num == 3:
        1 if random.randint(0,1000000) > 999999 else np.NaN
    
    return 1 if random.randint(0,100) > 99 else np.NaN
    
    
df["http_500"] = df["value"]*.05 # Server Error
df["http_404"] = df["value"]*.1 # Page not found
df["http_200"] = df["value"]*5 # OK

df["login_success"] = df["value"]*.6
df["latency"] = df["value"].apply(lambda x: latency(x))


df["db_errors_host01"] = df["value"].apply(lambda x: db_errors(x,1))
df["db_errors_host02"] = df["value"].apply(lambda x: db_errors(x,2))
df["db_errors_host03"] = df["value"].apply(lambda x: db_errors(x,3))


In [13]:
df.head()

Unnamed: 0,timestamp,value,http_500,http_404,http_200,login_success,latency,db_errors_host01,db_errors_host02,db_errors_host03
0,2019-02-01 00:00:00,3659,182.95,365.9,18295,2195.4,361.151053,,,1.0
1,2019-02-01 00:01:00,3465,173.25,346.5,17325,2079.0,355.277811,,,
2,2019-02-01 00:02:00,3159,157.95,315.9,15795,1895.4,401.367103,,,
3,2019-02-01 00:03:00,3128,156.4,312.8,15640,1876.8,541.58918,,,
4,2019-02-01 00:04:00,3544,177.2,354.4,17720,2126.4,764.417887,,,


In [14]:
df.iplot(x="timestamp", y=["http_500", "http_404"])

In [15]:
df.iplot(x="timestamp", y="latency")

In [16]:
# We should less less logins per HTTP 200 since 200 OKs happen a lot
df.iplot(x="timestamp", y=["login_success", "http_200"])

In [17]:
df.index = pd.to_datetime(df["timestamp"])

# Down-sample every 60 seconds
resampled = df[["timestamp","db_errors_host01","db_errors_host02","db_errors_host03"]].resample("60T").sum().reset_index()
resampled.iplot(kind="scatter", mode="markers", x="timestamp")