In [2]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, 'processed')
from utils import *

In [3]:
# Path (relative, without the .csv extension) of the lease dump to process.
# The same string is reused below as the sub-directory name under processed/.
data_src = 'datasets/lease_info_uc'

In [4]:
# Load the raw lease table from <data_src>.csv.
df = pd.read_csv(data_src + '.csv')

Columns:
- Lease_id
- Project_id
- User_id
- Created_at: when the lease is created by the user
- Deleted_at: when the lease is deleted by the user (user may delete the lease before the scheduled lease expiration time)
- Start_on: when the lease starts
- End_on: when the lease ends (user may delete the lease before this time)
- Status: the final status of the lease (don’t trust this column. It may say “active”, but the lease is already terminated)
- Node_cnt: the number of nodes we give to the user
- Reserve_conditions: what kind of nodes the user is asking for
- Reserve_count_range: the number of nodes the user is asking for (min-max)

In [5]:
# (rows, columns) of the raw lease table.
df.shape

(30487, 11)

In [6]:
# Preview the first five rows of the raw lease table.
df.head(5)

Unnamed: 0,lease_id,project_id,user_id,created_at,deleted_at,start_on,end_on,status,node_cnt,reserve_conditions,reserve_count_range
0,c1568c33eec9d2a24aa97c868c2cf97b,6024816b7387ea63997c1c37abc3d338,a257de1fb61e44ae745ceb1467c3225e,2021-05-13 18:12:56,2021-05-24 17:29:26,2021-05-13 18:13:00,2021-05-20 18:12:00,deleted,1,"[""=="",""$node_type"",""compute_haswell""]",1-1
1,701d0083dc1199fc92309f55d5475210,a956639e292fd1679669a9507e4cd0c9,45b7d90a39ea685ec4fcba668a70cebd,2019-05-12 05:17:27,2019-05-12 05:31:30,2019-05-12 05:18:00,2019-05-12 05:47:00,active,1,"[""="", ""$node_type"", ""compute_skylake""]",1-1
2,83fb37cc0d211e4577597edb5f354652,eedcd4c64ea8cbbbf7b94258540171c2,d271a59b4551cc100896c3a4550b7f43,2016-06-24 00:06:17,2016-06-28 00:42:08,2016-06-24 00:08:00,2016-06-30 00:08:00,completed,1,,1-1
3,2e6619797ecfc077761a28f88c3e88ff,a956639e292fd1679669a9507e4cd0c9,45b7d90a39ea685ec4fcba668a70cebd,2019-08-28 03:02:15,2019-08-28 03:03:13,2019-08-28 03:03:00,2019-08-28 04:03:00,active,1,"[""="", ""$node_type"", ""compute_haswell""]",1-1
4,0674ff0b4b9eee6f5d0f13ab9643ab1f,a956639e292fd1679669a9507e4cd0c9,45b7d90a39ea685ec4fcba668a70cebd,2020-09-12 15:02:21,2020-09-12 15:03:19,2020-09-12 15:03:00,2020-09-12 16:03:00,active,1,"[""="", ""$node_type"", ""compute_haswell""]",1-1


In [7]:
# Number of distinct lease ids; if it equals the row count, lease_id is a unique key.
df['lease_id'].unique().shape

(30487,)

In [8]:
import os

# Create the output directory for this data source.
# os.makedirs needs no shell (portable, no quoting problems with unusual
# paths), raises on real failures instead of ignoring an exit code, and
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('processed/%s' % data_src, exist_ok=True)

## Step 1. Select leases that were started successfully

In [9]:
# Drop leases whose recorded status says they never ran.
# NOTE(review): the column notes at the top of this notebook warn that the
# status column is unreliable -- confirm this filter does not discard leases
# that actually ran (e.g. ones deleted after their scheduled end).
never_started = ['pending', 'error', 'deleted']
started_mask = ~df['status'].isin(never_started)
df = df[started_mask]

## Step 2. Select leases that have specific node type requirements

In [10]:
from ast import literal_eval

# We assume users book lease based on node type only.
# Each non-missing reserve_conditions string is parsed exactly once; for the
# conditions that contain "$node_type", the third element is the node type.
# The result is sorted so that the list (and the order in which files are
# written later) is the same on every run, unlike iterating a bare set.
parsed_conditions = (literal_eval(cond) for cond in df['reserve_conditions'].dropna())
node_types = sorted({cond[2] for cond in parsed_conditions if '$node_type' in cond})

In [11]:
import random

# Walk the leases in row order and keep those whose reservation condition is a
# list containing "$node_type"; the third element of that list is taken as the
# requested node type. assign_nt and select_leases stay position-aligned.
assign_nt = []
select_leases = []
for lease_id, raw_condition in zip(df['lease_id'], df['reserve_conditions']):
    # pd.isna is the supported missing-value test. The previous identity check
    # (`is not np.nan`) only worked while every missing cell happened to be the
    # np.nan singleton; any other NaN object would have reached literal_eval.
    if pd.isna(raw_condition):
        # Disabled alternative, kept for reference: instead of skipping a lease
        # with a missing reserve_conditions value, assign a node type randomly:
        #     assign_nt.append(node_types[random.randint(0, len(node_types)-1)])
        #     select_leases.append(lease_id)
        continue
    condition = literal_eval(raw_condition)  # parse once per row
    if '$node_type' in condition:
        assign_nt.append(condition[2])
        select_leases.append(lease_id)

In [12]:
# Keep only the selected leases. .copy() makes the filtered frame independent
# of the original, so adding a column below is a plain assignment rather than
# a write onto a slice (which triggers SettingWithCopyWarning).
df = df[df['lease_id'].isin(select_leases)].copy()
# assign_nt was collected in row order, so it lines up positionally with the
# filtered rows (pandas raises if the lengths differ).
df['node_type'] = assign_nt
df = df.reset_index(drop=True)

In [13]:
# Number of selected leases for each node type.
df.groupby(by=['node_type'])['lease_id'].count()

node_type
compute             1360
compute_haswell    14599
compute_skylake     2191
fpga                  21
gpu_rtx_6000         371
gpu_v100              24
storage              149
Name: lease_id, dtype: int64

In [14]:
# Number of distinct reserve_conditions values among the selected leases.
df['reserve_conditions'].unique().shape

(19,)

## Step 3. Separate on-demand and in-advance requests

In [15]:
# Classify every lease by its lead time (start_on - created_at):
#   on-demand  -> starts at most 10 minutes after it was created
#   in-advance -> starts more than 10 minutes after it was created
df['created_at'] = pd.to_datetime(df['created_at'])
df['start_on'] = pd.to_datetime(df['start_on'])
lead_time = df['start_on'] - df['created_at']

# An explicit Timedelta replaces the string '10M': the 'M' unit is ambiguous
# (minutes vs. months) and recent pandas versions refuse to parse it.
on_demand_cutoff = pd.Timedelta(minutes=10)

# `<=` / `>` partition the leases; the previous strict `<` and `>` silently
# dropped any lease whose lead time was exactly 10 minutes from both sets.
on_demand = df[lead_time <= on_demand_cutoff]
in_advance = df[lead_time > on_demand_cutoff]

In [16]:
# (rows, columns) of the on-demand subset.
on_demand.shape

(18468, 12)

In [17]:
# Most spikes come from the internal project.
# To get more accurate on-demand user requests,
# we temporarily exclude the records that belong to this project.
# internal_project_id = on_demand.groupby(by=['project_id'])['node_cnt'].sum().sort_values().index[-1]
# on_demand = on_demand[on_demand['project_id'] != internal_project_id]
# on_demand['node_cnt'].plot()

In [18]:
# (rows, columns) of the in-advance subset.
in_advance.shape

(247, 12)

## Step 4. Save data by node type and request type

In [19]:
import os

# Save each node type to its own file, split by request type.
on_demand_dir = 'processed/%s/on_demand' % data_src
in_advance_dir = 'processed/%s/in_advance' % data_src

# DataFrame.to_csv does not create missing directories, so make sure both
# targets exist before writing (exist_ok keeps the cell re-runnable).
os.makedirs(on_demand_dir, exist_ok=True)
os.makedirs(in_advance_dir, exist_ok=True)

for nt in node_types:
    on_demand_nt = on_demand[on_demand['node_type'] == nt]
    on_demand_nt.to_csv('%s/%s.csv' % (on_demand_dir, nt), index=False)
    in_advance_nt = in_advance[in_advance['node_type'] == nt]
    in_advance_nt.to_csv('%s/%s.csv' % (in_advance_dir, nt), index=False)

In [20]:
from matplotlib.pylab import rcParams
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns

def plot_lease(data, name=None):
    """Show an interactive line chart of node count against lease start time.

    Parameters
    ----------
    data : pandas.DataFrame or mapping
        Must provide 'start_on' (x axis) and 'node_cnt' (y axis).
    name : str, optional
        Legend label of the trace. When omitted, the notebook-level variable
        ``nt`` is used if it exists (the previous behaviour); otherwise the
        label falls back to 'node_cnt'.

    Returns
    -------
    None
        The figure is displayed through ``fig.show()``.
    """
    if name is None:
        # The function used to read the global `nt` unconditionally, which
        # raised NameError whenever it was called before `nt` was defined.
        name = globals().get('nt', 'node_cnt')

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=data['start_on'], y=data['node_cnt'], mode='lines', name=name)
    )
    fig.update_layout(
        showlegend=True,
        margin=dict(l=20, r=20, t=20, b=20),
        paper_bgcolor="LightSteelBlue",
    )
    fig.show()

## TACC Data

In [21]:
import os
import pandas as pd
import numpy as np

# Entries of the processed TACC directory whose name contains '.csv'.
# NOTE(review): Step 4 writes the CSV files into on_demand/ and in_advance/
# sub-directories, so this top-level listing may come back empty -- confirm
# which directory was meant.
node_types = [entry for entry in os.listdir('processed/lease_info_tacc') if '.csv' in entry]
# File name of the node type plotted in the cells below.
nt = 'compute_haswell.csv'

In [22]:
# TACC on-demand leases for the chosen node type, ordered by start time.
df = pd.read_csv('processed/lease_info_tacc/on_demand/%s' % nt).sort_values(by=['start_on'])
plot_lease(df)
# Keep an independent copy, because df is reused by the following cells.
tacc_on_demand = df.copy()

In [23]:
# TACC in-advance leases for the chosen node type, ordered by start time.
df = pd.read_csv('processed/lease_info_tacc/in_advance/%s' % nt).sort_values(by=['start_on'])
plot_lease(df)

## UC Data

In [24]:
import os
import pandas as pd
import numpy as np

# Entries of the processed UC directory whose name contains '.csv'.
# NOTE(review): Step 4 writes the CSV files into on_demand/ and in_advance/
# sub-directories, so this top-level listing may come back empty -- confirm
# which directory was meant.
node_types = [entry for entry in os.listdir('processed/lease_info_uc') if '.csv' in entry]

In [25]:
# UC on-demand leases for the node type file name held in nt, ordered by start time.
df = pd.read_csv('processed/lease_info_uc/on_demand/%s' % nt).sort_values(by=['start_on'])
plot_lease(df)
# Keep an independent copy, because df is reused by the following cells.
uc_on_demand = df.copy()

In [26]:
# UC in-advance leases for the node type file name held in nt, ordered by start time.
df = pd.read_csv('processed/lease_info_uc/in_advance/%s' % nt).sort_values(by=['start_on'])
plot_lease(df)

## Concat TACC and UC Data

In [27]:
# Stack the TACC and UC on-demand leases row-wise into a single table.
df = pd.concat([tacc_on_demand, uc_on_demand])
df.shape

(30752, 12)

In [28]:
# Order the combined leases by start time and use that column as the index.
df = df.sort_values(by=['start_on']).set_index(['start_on'])
# plot_trace is not defined in this notebook; presumably it comes from the
# `from utils import *` at the top -- verify.
plot_trace(df, ['node_cnt'])