In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from scipy.stats import lognorm
from pathlib import Path
import sys
import os

import pmdarima as pm
from pmdarima.model_selection import train_test_split

import process_data
import source.utilities as utilities 
import source.policy as policy

In [None]:
def process_files(num_chunks, days=range(1, 13)):
    """Find, per positional chunk, the apps present in that chunk on every day.

    Each day's per-app table is cut into ``num_chunks`` contiguous row slices
    (the first ``n_rows % num_chunks`` slices receive one extra row). The
    ``HashApp`` values of slice ``k`` are intersected across all days, so an
    app is kept only if it lands in the same slice index on every day.

    Parameters
    ----------
    num_chunks : int
        Number of row slices per day; must be at least 1.
    days : iterable of int, optional
        Day numbers to load, one ``process_data.Dataset`` per day.
        Defaults to days 1 through 12.

    Returns
    -------
    list of set
        ``num_chunks`` sets; entry ``k`` holds the ``HashApp`` values common
        to slice ``k`` of every day. All sets are empty when ``days`` is empty.

    Raises
    ------
    ValueError
        If ``num_chunks`` is smaller than 1.
    """
    # Fail before any (slow) dataset parsing instead of hitting a
    # ZeroDivisionError after the first day has already been loaded.
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be a positive integer, got {num_chunks!r}")

    minute = 1440

    hashapp_lists = [set() for _ in range(num_chunks)]

    for day_pos, day in enumerate(days):
        dataset = process_data.Dataset(range(day, day + 1), range(day, day + 1), minute)
        # Only the app identifiers (in table row order) are needed here.
        hashapps = dataset.df_grouped_by_app.sum().reset_index()['HashApp']

        # NOTE(review): slicing is positional, so the result depends on each
        # day's row order and row count -- confirm this is intended.
        base_size, extra = divmod(len(hashapps), num_chunks)

        start = 0
        for chunk_idx in range(num_chunks):
            stop = start + base_size + (1 if chunk_idx < extra else 0)
            day_apps = set(hashapps.iloc[start:stop])

            if day_pos == 0:
                # First day seeds the running intersection.
                hashapp_lists[chunk_idx] = day_apps
            else:
                hashapp_lists[chunk_idx] &= day_apps

            start = stop

    return hashapp_lists


In [None]:
def process_and_concat_files(common_hashapps, days=range(1, 13),
                             output_dir='chunk_invocation_data'):
    """Write one CSV per chunk holding every day's per-minute columns side by side.

    Each day's per-app table is cut into ``len(common_hashapps)`` contiguous
    row slices (the first ``n_rows % num_chunks`` slices receive one extra
    row). Slice ``k`` is restricted to the apps in ``common_hashapps[k]``
    (in sorted order), its minute columns ``'1'..'1440'`` are renamed to
    ``day{day}_{minute}``, and the result is appended column-wise to
    ``<output_dir>/chunk_{k+1}.csv``.

    The first processed day always overwrites the chunk file; every later day
    re-reads the file, concatenates its columns and rewrites it.

    Parameters
    ----------
    common_hashapps : sequence of iterable of str
        One collection of ``HashApp`` values per chunk. Every app must be
        present in the matching slice of every day; in this notebook that is
        guaranteed by building the argument with ``process_files`` using the
        same number of chunks and the same ``days``.
    days : iterable of int, optional
        Day numbers to load, one ``process_data.Dataset`` per day.
        Defaults to days 1 through 12.
    output_dir : str, optional
        Directory receiving the ``chunk_{k}.csv`` files; created if missing.

    Raises
    ------
    ValueError
        If ``common_hashapps`` is empty.
    KeyError
        If an app listed for a chunk is missing from that slice on some day.
    """
    num_chunks = len(common_hashapps)
    # Fail before parsing any data rather than dividing by zero later.
    if num_chunks == 0:
        raise ValueError("common_hashapps must contain at least one chunk")

    os.makedirs(output_dir, exist_ok=True)

    minute = 1440
    minute_labels = [str(m) for m in range(1, 1441)]

    # Loop-invariant per-chunk data: row order and destination file.
    ordered_hashapps = [sorted(apps) for apps in common_hashapps]
    file_paths = [
        os.path.join(output_dir, f'chunk_{chunk_idx + 1}.csv')
        for chunk_idx in range(num_chunks)
    ]

    for day_pos, day in enumerate(days):
        dataset = process_data.Dataset(range(day, day + 1), range(day, day + 1), minute)
        per_app = dataset.df_grouped_by_app.sum().reset_index()
        columns_to_select = ['HashApp'] + list(per_app.loc[:, '1':'1440'].columns)
        per_app = per_app[columns_to_select]

        # Same mapping for every chunk of this day, so build it once.
        col_mapping = {label: f'day{day}_{label}' for label in minute_labels}

        base_size, extra = divmod(len(per_app), num_chunks)

        start = 0
        for chunk_idx in range(num_chunks):
            stop = start + base_size + (1 if chunk_idx < extra else 0)

            # .loc both filters to the common apps and fixes the row order.
            chunk = (
                per_app.iloc[start:stop]
                .set_index('HashApp')
                .loc[ordered_hashapps[chunk_idx]]
                .rename(columns=col_mapping)
            )

            file_path = file_paths[chunk_idx]
            # Only later days append; the first day replaces any stale file
            # left over from a previous run.
            if day_pos > 0 and os.path.exists(file_path):
                existing_chunk = pd.read_csv(file_path, index_col='HashApp')
                chunk = pd.concat([existing_chunk, chunk], axis=1)

            chunk.to_csv(file_path)

            start = stop


In [None]:
# Number of contiguous row slices each day's per-app table is split into.
num_chunks = 8
# One set per slice: the HashApp values found in that slice on every loaded day.
common_hashapps = process_files(num_chunks)

Parsing azurefunctions-dataset2019, days: 1, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]


Parsing azurefunctions-dataset2019, days: 2, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it]


Parsing azurefunctions-dataset2019, days: 3, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


Parsing azurefunctions-dataset2019, days: 4, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it]


Parsing azurefunctions-dataset2019, days: 5, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]


Parsing azurefunctions-dataset2019, days: 6, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.23s/it]


Parsing azurefunctions-dataset2019, days: 7, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]


Parsing azurefunctions-dataset2019, days: 8, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]


Parsing azurefunctions-dataset2019, days: 9, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


Parsing azurefunctions-dataset2019, days: 10, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]


Parsing azurefunctions-dataset2019, days: 11, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


Parsing azurefunctions-dataset2019, days: 12, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]


In [5]:
# Write chunk_invocation_data/chunk_<k>.csv, one file per slice, with each day's
# minute columns appended side by side for the apps in common_hashapps.
process_and_concat_files(common_hashapps)

Parsing azurefunctions-dataset2019, days: 1, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.62s/it]


Parsing azurefunctions-dataset2019, days: 2, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


Parsing azurefunctions-dataset2019, days: 3, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]


Parsing azurefunctions-dataset2019, days: 4, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.67s/it]


Parsing azurefunctions-dataset2019, days: 5, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.64s/it]


Parsing azurefunctions-dataset2019, days: 6, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


Parsing azurefunctions-dataset2019, days: 7, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]


Parsing azurefunctions-dataset2019, days: 8, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]


Parsing azurefunctions-dataset2019, days: 9, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]


Parsing azurefunctions-dataset2019, days: 10, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


Parsing azurefunctions-dataset2019, days: 11, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:02<00:00,  3.00s/it]


Parsing azurefunctions-dataset2019, days: 12, minutes: 1440


Parsing Data: 100%|██████████| 1/1 [00:03<00:00,  3.22s/it]
