In [1]:
import pandas as pd
import numpy as np
import itertools as it
import random

### Generate Mock Data

The helpers below build a synthetic dataset with randomly missing months, used to illustrate the gap-filling step:

In [2]:
def gen_mock_zone_list():
    """Build a randomized list of mock zone names.

    Names have the form ``ap-<direction>-<sub_section><section>``, e.g.
    ``ap-north-a`` or ``ap-southwest-2b``. There are eight directions (the
    four cardinal ones plus the four north/south x east/west compounds).
    For each direction one sub-section layout is drawn at random from
    ``['']``, ``['1']`` or ``['1', '2']``, and every sub-section is combined
    with the sections ``'a'`` and ``'b'``.

    Returns
    -------
    list of str
        Zone names ordered by direction, then sub-section, then section.
    """
    zone_prefix = 'ap'
    sections = ['a', 'b']
    sub_section_layouts = [[''], ['1'], ['1', '2']]

    base_directions = ['north', 'south', 'east', 'west']
    # Compound directions: each of north/south joined with each of east/west.
    compound_directions = [
        ns + ew
        for ns, ew in it.product(base_directions[:2], base_directions[-2:])
    ]

    zones = []
    for direction in base_directions + compound_directions:
        # Exactly one random draw per direction decides its sub-sections.
        layout = random.choice(sub_section_layouts)
        for sub_section in layout:
            for section in sections:
                zones.append(f'{zone_prefix}-{direction}-{str(sub_section) + section}')
    return zones

In [3]:
def get_ecs_category_list():
    """Return the fixed list of ECS category labels used in the mock data.

    A fresh list is returned on every call, so callers may mutate it freely.
    """
    return [
        'CPU_CREDIT',
        'CPU_EXCLUSIVE',
        'CPU_EXCLUSIVE(ne)',
        'CPU_SHARE',
    ]

In [4]:
def get_months_list(from_year=2017, to_year=2018,
                    from_yearmonth=None, to_yearmonth=None,
                    missing_rate=0):
    """Return sorted ``'YYYYMM'`` month labels, optionally with random gaps.

    Parameters
    ----------
    from_year, to_year : int
        Inclusive year range. Used only when ``from_yearmonth`` and
        ``to_yearmonth`` are not both supplied.
    from_yearmonth, to_yearmonth : str or int, optional
        Inclusive ``'YYYYMM'`` bounds. When both are given they take
        precedence over ``from_year`` / ``to_year``.
    missing_rate : float
        Fraction of the months to drop at random. ``0`` keeps every month.

    Returns
    -------
    list of str
        Month labels in ascending order.
    """

    def gen_month_seq(from_year, to_year):
        # Every month of every year in the inclusive range, as 'YYYYMM'.
        return [f'{y:4d}{m:02d}'
                for y in range(from_year, to_year + 1)
                for m in range(1, 13)]

    if from_yearmonth is not None and to_yearmonth is not None:
        # Normalise to strings so integer bounds (e.g. 201703) work too.
        lower, upper = str(from_yearmonth), str(to_yearmonth)
        # Labels are fixed-width, so string order equals chronological order.
        # Generate whole years, then trim to the requested inclusive window.
        month_list = [
            ym
            for ym in gen_month_seq(int(lower[:-2]), int(upper[:-2]))
            if lower <= ym <= upper
        ]
    else:
        month_list = gen_month_seq(from_year, to_year)

    if missing_rate == 0:
        return month_list

    # Keep a random subset, then restore chronological order.
    keep_count = int(len(month_list) * (1 - missing_rate))
    return sorted(random.sample(month_list, keep_count))

In [5]:
def get_mock_data(missing_rate=.2, ncpu_core_range=(1,64), grouped=False):
    """Generate a mock core-count table with randomly missing months.

    One row is produced per (zone, ECS category, month). For every
    (zone, category) pair an individual missing rate is drawn uniformly from
    ``[0, missing_rate]`` and that share of months is dropped.

    Parameters
    ----------
    missing_rate : float
        Upper bound of the per-pair fraction of months to drop.
    ncpu_core_range : tuple of (int, int)
        Inclusive bounds passed to ``random.randint`` for ``n_cores``.
    grouped : bool
        If True, return the frame indexed by
        ``['az_no', 'ecs_category_3_en', 'Week']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``az_no``, ``ecs_category_3_en``, ``Week``, ``n_cores``.
        The ``Week`` column holds the ``'YYYYMM'`` labels produced by
        ``get_months_list``.
    """
    columns = ['az_no', 'ecs_category_3_en', 'Week', 'n_cores']
    ecs_category_list = get_ecs_category_list()

    # Collect plain tuples and build the frame once: growing a DataFrame
    # row by row with ``df.loc[i] = ...`` reallocates on every insert.
    records = []
    for az in gen_mock_zone_list():
        for cat in ecs_category_list:
            # Each (zone, category) pair gets its own random gap pattern.
            months = get_months_list(missing_rate=random.uniform(0, missing_rate))
            for week in months:
                records.append((az, cat, week, random.randint(*ncpu_core_range)))

    df = pd.DataFrame(records, columns=columns)
    if grouped:
        return df.set_index(['az_no', 'ecs_category_3_en', 'Week'])
    return df

In [6]:
# Build the mock dataset, indexed by (az_no, ecs_category_3_en, Week).
df = get_mock_data(grouped=True)

In [7]:
# Display the generated mock data (months are missing at random).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_cores
az_no,ecs_category_3_en,Week,Unnamed: 3_level_1
ap-north-a,CPU_CREDIT,201701,20
ap-north-a,CPU_CREDIT,201702,62
ap-north-a,CPU_CREDIT,201703,63
ap-north-a,CPU_CREDIT,201704,26
ap-north-a,CPU_CREDIT,201705,44
ap-north-a,CPU_CREDIT,201706,37
ap-north-a,CPU_CREDIT,201707,3
ap-north-a,CPU_CREDIT,201708,31
ap-north-a,CPU_CREDIT,201709,46
ap-north-a,CPU_CREDIT,201710,40


### Fill Missing "Dates" in place (reindex to the full month range)


In [8]:
# Flatten the MultiIndex so the key columns are available as ordinary columns.
df_plain = df.reset_index()

# Earliest and latest period label present anywhere in the data, and the
# month sequence requested between those two bounds.
week_range = (df_plain['Week'].min(), df_plain['Week'].max())
full_week_list = get_months_list(
    from_yearmonth=week_range[0],
    to_yearmonth=week_range[1],
)

# The (zone, category) pairs that actually occur in the data.
zone_cat = df_plain.groupby(['az_no', 'ecs_category_3_en']).groups.keys()

# Target index: every observed (zone, category) pair crossed with every
# month in the list, with pairs varying slowest.
full_index = [
    (zone, cat, week)
    for (zone, cat), week in it.product(zone_cat, full_week_list)
]

In [9]:
# Expand to the full (zone, category, month) index. Rows that did not exist
# are created with 0 directly via `fill_value`, so no intermediate NaN is
# introduced and an integer `n_cores` column is not upcast to float.
df_full = df.reindex(full_index, fill_value=0)

In [10]:
# Compare sizes: the reindexed frame versus the original frame with gaps.
( df_full.shape, df.shape ) 

((1920, 1), (1679, 1))

In [11]:
# Display the reindexed frame; months absent from `df` appear with n_cores 0.
df_full

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_cores
az_no,ecs_category_3_en,Week,Unnamed: 3_level_1
ap-east-1a,CPU_CREDIT,201701,21
ap-east-1a,CPU_CREDIT,201702,33
ap-east-1a,CPU_CREDIT,201703,14
ap-east-1a,CPU_CREDIT,201704,24
ap-east-1a,CPU_CREDIT,201705,45
ap-east-1a,CPU_CREDIT,201706,7
ap-east-1a,CPU_CREDIT,201707,14
ap-east-1a,CPU_CREDIT,201708,0
ap-east-1a,CPU_CREDIT,201709,11
ap-east-1a,CPU_CREDIT,201710,17
