### Setup:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!apt-get update

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.180% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.180% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Waiting for h                                                                               Hit:4 https://developer.download.nvidia.com/comp

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"


from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, minute, concat, date_trunc, udf, col, rand, lit, to_timestamp, count, avg
from pyspark.sql import functions as sparkFunctions
from pathlib import Path
from pyspark.sql.types import StructField,  StructType, IntegerType, FloatType, TimestampType, StringType, BooleanType, LongType


path = Path("/content/drive/My Drive/Colab Notebooks/ds/")

conf = SparkConf().setAppName("DSIngestor").setMaster("local[*]").set('spark.driver.memory', '22g')
sc = SparkContext(conf=conf)


spark = SparkSession(sc)
spark.sparkContext.setLogLevel("ERROR")

In [2]:
!pip install torchviz
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from torchviz import make_dot
from torch.nn import functional as F
import random
from collections import defaultdict
import time, datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np


# if gpu is to be used
use_cuda = torch.cuda.is_available()

device = torch.device("cuda:0" if use_cuda else "cpu")
Tensor = torch.Tensor
LongTensor = torch.LongTensor

Collecting torchviz
[?25l  Downloading https://files.pythonhosted.org/packages/8f/8e/a9630c7786b846d08b47714dd363a051f5e37b4ea0e534460d8cdfc1644b/torchviz-0.0.1.tar.gz (41kB)
[K     |████████                        | 10kB 17.7MB/s eta 0:00:01[K     |████████████████                | 20kB 24.2MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 23.4MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 19.7MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.9MB/s 
Building wheels for collected packages: torchviz
  Building wheel for torchviz (setup.py) ... [?25l[?25hdone
  Created wheel for torchviz: filename=torchviz-0.0.1-cp36-none-any.whl size=3522 sha256=c678443347370dce26a429e0d5d7a64e467e9054ad06bd3d235bbd4ad3b35e17
  Stored in directory: /root/.cache/pip/wheels/2a/c2/c5/b8b4d0f7992c735f6db5bfa3c5f354cf36502037ca2b585667
Successfully built torchviz
Installing collected packages: torchviz
Successfully installed torchviz-0.0.1


### Data Load:
___________

#### Scaler data generation:

In [None]:
schemaFields = StructType([ StructField("region", StringType(), True),
                            StructField("channel", StringType(), True),
                            StructField("session_id", StringType(), True),
                            StructField("chunk_quality", LongType(), True),
                            StructField("mean_payload_per_req", FloatType(), True),
                            StructField('req_count', IntegerType(), True),
                            StructField("windowstart", TimestampType(), True)
                          ])

requestDS = spark.read.schema(schemaFields).csv("/content/drive/My Drive/Colab Notebooks/ds/requestDS")

requestDS = requestDS.drop()

requestDS = requestDS.na.drop()


mean_payload_per_req_ds = requestDS.select('mean_payload_per_req').toPandas()

path = "/content/drive/My Drive/Colab Notebooks/ds/"

mean_payload_per_req_ds.to_csv(path+'mean_payload_per_req.csv')

mean_reqnum_per_sess_15sec_ds = requestDS.select('req_count').toPandas()

path = "/content/drive/My Drive/Colab Notebooks/ds/"

mean_reqnum_per_sess_15sec_ds.to_csv(path+'mean_reqnum_per_sess_15sec_ds.csv')


path = "/content/drive/My Drive/Colab Notebooks/ds/"

ds = pd.read_csv(path+'requests25.csv')

ds['windowstart'] = pd.to_datetime(ds['windowstart'])

ds['mean_payload_per_timestep'] = ds['mean_payload_per_req']*ds['req_count']

ds['mean_payload_per_timestep'].to_csv(path+'mean_payload_per_timestep')

#### Avoidable (Checkpointed):

In [None]:
schemaFields = StructType([ StructField("region", StringType(), True),
                            StructField("channel", StringType(), True),
                            StructField("session_id", StringType(), True),
                            StructField("chunk_quality", LongType(), True),
                            StructField("mean_payload_per_req", FloatType(), True),
                            StructField('req_count', IntegerType(), True),
                            StructField("windowstart", TimestampType(), True)
                          ])

requestDS = spark.read.schema(schemaFields).csv("/content/drive/My Drive/Colab Notebooks/ds/requestDS")

requestDS = requestDS.drop()

requestDS = requestDS.na.drop()



channel_mapping = {}

i = 0

for a in ['944', '666', '940', '938', '685', '703', '3', '933', '697', '815',
          '1868', '943', '932', '702', '694', '936', '1839', '822', '941',
          '688', '852', '821', '1869', '701', '2663', '945', '668', '1600',
          '705', '934', '704', '875', '810', '706', '939', '942', '2',
          '1662', '1840', '937', '699']:

  channel_mapping[a] = i

  i += 1



region_mapping = {}

j = 0

for region in ['RM','NA','MI','BO']:

  region_mapping[region] = j

  j += 1



def map_and_checkpoint_ds(ds, name):

  ds = ds.toPandas()

  ds = ds.replace({'channel': channel_mapping})

  ds = ds.replace({'region': region_mapping})

  '''
  Save to CSV
  '''
  path = "/content/drive/My Drive/Colab Notebooks/RL/"

  # pandas_25_00_6.to_csv(path+'requests_26072017_00_06.csv')

  ds.to_csv(path+name+'.csv')


In [None]:
# sould be more than 13 million records O.o [only for 1 day]
from_00_to_6 =  requestDS\
                  .filter(sparkFunctions.col('windowstart') \
                          < sparkFunctions.lit(datetime.datetime.strptime("2018-07-25 6:00:00", '%Y-%m-%d %H:%M:%S')))

pandas_25_00_6 = from_00_to_6.toPandas()

pandas_25_00_6 = pandas_25_00_6.replace({'channel': channel_mapping})

pandas_25_00_6 = pandas_25_00_6.replace({'region': region_mapping})


'''
Save to CSV
'''
path = "/content/drive/My Drive/Colab Notebooks/RL/"

pandas_25_00_6.to_csv(path+'requests_26072017_00_06.csv')

In [None]:
map_and_checkpoint_ds(requestDS, 'requests_25072018')

#### Checkpoint:
________

In [None]:
path = "/content/drive/My Drive/cleanDatasetsCDN/network/live/"

#ds_24 = pd.read_csv(path+'requests24.csv')
ds_25 = pd.read_csv(path+'requests25.csv')
ds_26 = pd.read_csv(path+'requests26.csv')
ds_27 = pd.read_csv(path+'requests27.csv')
ds_28 = pd.read_csv(path+'requests28.csv')
ds_29 = pd.read_csv(path+'requests29.csv')
# ds_30 = pd.read_csv(path+"requests30.csv") DAMAGED. TO FIX

ds = pd.concat([ds_25,ds_26,ds_27,ds_28,ds_29],ignore_index=True)
ds['windowstart'] = pd.to_datetime(ds['windowstart'])

In [None]:
week_ds = ds.groupby(['region','channel','session_id', 'windowstart']).agg(max_br=('chunk_quality', 'max'), mean_payload_per_req=('mean_payload_per_req', 'mean'), req_count=('req_count', 'sum'))

week_ds = week_ds.reset_index()

week_ds['windowstart'] = pd.to_datetime(week_ds['windowstart'])

week_ds['mean_payload_per_timestep'] = week_ds['mean_payload_per_req']  * week_ds['req_count']

In [5]:
path = "/content/drive/My Drive/cleanDatasetsCDN/network/live/"

ds_29 = pd.read_csv(path+'requests29.csv')

ds_29 = ds_29.groupby(['region','channel','session_id', 'windowstart']).agg(max_br=('chunk_quality', 'max'), mean_payload_per_req=('mean_payload_per_req', 'mean'), req_count=('req_count', 'sum'))

ds_29 = ds_29.reset_index()

ds_29['windowstart'] = pd.to_datetime(ds_29['windowstart'])

ds_29['mean_payload_per_timestep'] = ds_29['mean_payload_per_req']  * ds_29['req_count']

#### Assigner DS: OLD VERSION

In [None]:
def reset_time_framing(ds):

  start_time, stop_time = \
  ds['windowstart'].min(), ds['windowstart'].max()

  current_time = start_time

  return start_time, stop_time, current_time



def get_sessions_to_delete():

  sessions_to_delete = []

  sessions = list(sessions_dict.keys())

  ended_sessions = 0

  for session in sessions:

    sessions_dict[session]['TTL'] -= 1

    if sessions_dict[session]['TTL'] == 0:

      sessions_to_delete.append(session)

      del sessions_dict[session]

      ended_sessions += 1

  return sessions_to_delete



def get_incoming_reqs(ds):
    '''
    Returns the new session requests.

    '''

    timesteps = 0

    current_time_frame_ds = \
      ds[ds['windowstart'] >= current_time]\
      [ds['windowstart'] < (current_time + delta_t)]

    old_reqs = 0

    new_sess_reqs = []

    for index, row in current_time_frame_ds.iterrows():

      if row['session_id'] not in sessions_dict.keys():

        new_sess_reqs.append({  'session_id' : row['session_id'],
                                'client_cluster' : row['region'],
                                'channel': row['channel'],
                                'br' : row['max_br'],
                                'mean_req_count' : row['req_count'],
                                'mean_payload_per_req': row['mean_payload_per_req'],
                                'mean_payload_per_timestep': row['mean_payload_per_req'] * row['req_count']
                                })
        
        sessions_dict[row['session_id']] = \
                                { 
                                'TTL': session_ttl_timesteps,
                                }

      else:

        old_reqs += 1

        sessions_dict[row['session_id']]['TTL'] = session_ttl_timesteps


    return { 'new_sess' : new_sess_reqs, 'sess_to_delete' : get_sessions_to_delete()}

In [None]:
from tqdm.notebook import tqdm as tqdmsito

start_time, stop_time, current_time = reset_time_framing(week_ds)

sessions_dict = {}

session_ttl_timesteps = 10

delta_t = datetime.timedelta(seconds = 15)


optimized_ds = []


pbar = tqdmsito(total = 28800)

while current_time <= stop_time:

  optimized_ds.append(get_incoming_reqs(week_ds))

  current_time = current_time + delta_t

  print(current_time)
  
  pbar.update(1)

pbar.close()

In [None]:
import pickle

path = "/content/drive/My Drive/Colab Notebooks/ds/"


with open( path + "optimized_request_ds24.txt", "wb") as fp:   #Pickling

  pickle.dump(optimized_ds, fp)


### Assigner DS: NEW OPTMIZED VERSION:

In [None]:
def get_sessions_to_delete():

  sessions_to_delete = []

  sessions = list(sessions_dict.keys())

  for session in sessions:

    sessions_dict[session]['TTL'] -= 1

    if sessions_dict[session]['TTL'] == 0:

      sessions_to_delete.append(session)

      del sessions_dict[session]

  return sessions_to_delete



def get_incoming_reqs(current_time_frame_ds):
    '''
    Returns the new session requests.

    '''
    new_sess_reqs = []

    for index, row in current_time_frame_ds.iterrows():

      if row['session_id'] not in sessions_dict.keys():

        new_sess_reqs.append({  'session_id' : row['session_id'],
                                'client_cluster' : row['region'],
                                'channel': row['channel'],
                                'br' : row['max_br'],
                                'mean_req_count' : row['req_count'],
                                'mean_payload_per_req': row['mean_payload_per_req'],
                                'mean_payload_per_timestep': row['mean_payload_per_req'] * row['req_count']
                                })
        
        sessions_dict[row['session_id']] = \
                                { 
                                'TTL': session_ttl_timesteps,
                                }

      else:

        sessions_dict[row['session_id']]['TTL'] = session_ttl_timesteps


    return { 'new_sess' : new_sess_reqs, 'sess_to_delete' : get_sessions_to_delete()}

In [None]:
ans = [pd.DataFrame(y) for x, y in week_ds.sort_values(by=['windowstart']).groupby('windowstart', as_index=False)]

In [None]:
from tqdm.notebook import tqdm as tqdmsito

sessions_dict = {}

session_ttl_timesteps = 10

optimized_ds = []

for current_time_frame_ds in tqdmsito(ans):

  optimized_ds.append(get_incoming_reqs(current_time_frame_ds))


HBox(children=(FloatProgress(value=0.0, max=28802.0), HTML(value='')))




In [None]:
import pickle

print('saving into',path )

with open( path + "opt_req_ds_25-29.p", "wb") as fp:   #Pickling

  pickle.dump(optimized_ds, fp)


saving into /content/drive/My Drive/cleanDatasetsCDN/network/live/


In [None]:
len(optimized_ds[2]['new_sess'])

19

In [None]:
with open(path + "opt_req_ds_25-29.txt", "rb") as fp:   # Unpickling
  
  optimized_DS_2 = pickle.load(fp)

NameError: ignored

### Optimized DS Formatting for exact optimizer:

In [6]:
import pandas as pd 

path = "/content/drive/My Drive/cleanDatasetsCDN/network/live/"

ds_29 = pd.read_csv(path+'requests29.csv')

ds_29 = ds_29.groupby(['region','channel','session_id', 'windowstart']).agg(max_br=('chunk_quality', 'max'), mean_payload_per_req=('mean_payload_per_req', 'mean'), req_count=('req_count', 'sum'))

ds_29 = ds_29.reset_index()

ds_29['windowstart'] = pd.to_datetime(ds_29['windowstart'])

ds_29['mean_payload_per_timestep'] = ds_29['mean_payload_per_req']  * ds_29['req_count']

In [8]:
ds_29

Unnamed: 0,region,channel,session_id,windowstart,max_br,mean_payload_per_req,req_count,mean_payload_per_timestep
0,0,0,RM220-0-0-1532751023-00001093.70.179.105,2018-07-28 23:59:45,96000,208.0000,1,208.000
1,0,0,RM220-0-0-1532751023-00001093.70.179.105,2018-07-29 00:00:00,96000,208.0000,2,416.000
2,0,0,RM220-0-0-1532751023-00001093.70.179.105,2018-07-29 00:00:30,786432,208.0000,2,416.000
3,0,0,RM220-0-0-1532751023-00001093.70.179.105,2018-07-29 00:00:45,786432,208.0000,2,416.000
4,0,0,RM220-0-0-1532751023-00001093.70.179.105,2018-07-29 00:01:00,786432,208.0000,4,832.000
...,...,...,...,...,...,...,...,...
6322349,3,40,BO220-0-3-1532904532-00000537.182.43.223,2018-07-29 22:49:15,2936012,348350.6000,12,4180207.200
6322350,3,40,BO220-0-3-1532904532-00000537.182.43.223,2018-07-29 22:49:30,2936012,352233.6215,14,4931270.701
6322351,3,40,BO220-0-3-1532904532-00000537.182.43.223,2018-07-29 22:49:45,2936012,339316.5000,10,3393165.000
6322352,3,40,BO220-0-3-1532905424-00000493.144.178.33,2018-07-29 23:03:45,2936012,345100.0830,5,1725500.415


#15 second timestep:

In [None]:
ans = [pd.DataFrame(y) for x, y in ds_29.sort_values(by=['windowstart']).groupby('windowstart', as_index=False)]

In [None]:
def get_sessions_to_delete():

  sessions_to_delete = []

  sessions = list(sessions_dict.keys())

  for session in sessions:

    sessions_dict[session]['TTL'] -= 1

    if sessions_dict[session]['TTL'] == 0:

      sessions_to_delete.append(session)

      del sessions_dict[session]

  return sessions_to_delete



def get_incoming_reqs(current_time_frame_ds):
    '''
    Returns the new session requests.

    '''
    new_sess_reqs = []

    for index, row in current_time_frame_ds.iterrows():

      if row['session_id'] not in sessions_dict.keys():

        new_sess_reqs.append({  'session_id' : row['session_id'],
                                'client_cluster' : row['region'],
                                'channel': row['channel'],
                                'br' : row['max_br'],
                                'mean_req_count' : row['req_count'],
                                'mean_payload_per_req': row['mean_payload_per_req'],
                                'mean_payload_per_timestep': row['mean_payload_per_req'] * row['req_count']
                                })
        
        sessions_dict[row['session_id']] = \
                                { 
                                'TTL': session_ttl_timesteps,
                                }

      else:

        sessions_dict[row['session_id']]['TTL'] = session_ttl_timesteps


    return { 'new_sess' : new_sess_reqs, 'sess_to_delete' : get_sessions_to_delete()}

In [None]:
from tqdm.notebook import tqdm as tqdmsito

sessions_dict = {}

session_ttl_timesteps = 10

optimized_ds = []

for current_time_frame_ds in tqdmsito(ans):

  optimized_ds.append(get_incoming_reqs(current_time_frame_ds))


In [None]:
len(optimized_ds)

5761

In [None]:
import pickle

path = "/content/drive/My Drive/Colab Notebooks/ds/"


with open( "optimized_request_ds29_FOR_EXACTOR.txt", "wb") as fp:   #Pickling

  pickle.dump(optimized_ds, fp)


### New Version: (NO aggregations but sessions);:


In [None]:
import pickle
path = "/content/drive/My Drive/Colab Notebooks/ds/"

with open(path + "optimized_request_ds29_FOR_EXACTOR.txt", "rb") as fp:   # Unpickling
  
  b = pickle.load(fp)

#assert b == optimized_ds}

print(len(b))

5761


In [None]:
# Transforming to Pandas:
from tqdm.notebook import tqdm as tqdmsito

sessions_pandas_list = []

for entry in tqdmsito(b):

  sessions_pandas_list.append({'new_sess_pd': pd.DataFrame(entry['new_sess']),
                               'sess_to_delete_pd': pd.DataFrame(entry['sess_to_delete'])})
  


HBox(children=(FloatProgress(value=0.0, max=5761.0), HTML(value='')))




##### Scaling: 

In [None]:
import joblib

scalers_path = "/content/drive/My Drive/Colab Notebooks/scalers/"

# payload_scaler_filename = scalers_path + "payload_scaler.save"

bitrate_scaler_filename = scalers_path + "bitrate_scaler.save"

payload_per_timestep_scaler_filename = scalers_path + "payload_per_timestep_scaler.save"

bitrate_scaler = joblib.load(bitrate_scaler_filename)

payload_per_timestep_scaler = joblib.load(payload_per_timestep_scaler_filename)

In [None]:
for entry in tqdmsito(sessions_pandas_list):

  if len(entry['new_sess_pd'].count()) != 0:

    unscaled_payloads = entry['new_sess_pd']['mean_payload_per_timestep'].to_numpy().reshape(-1, 1)

    scaled_payloads = payload_per_timestep_scaler.transform(unscaled_payloads).reshape(1,-1)

    entry['new_sess_pd']['scaled_payload'] = pd.Series(scaled_payloads.squeeze())

    unscaled_bitrates = entry['new_sess_pd']['br'].to_numpy().reshape(-1, 1)

    scaled_bitrates = bitrate_scaler.transform(unscaled_bitrates).reshape(1,-1)

    entry['new_sess_pd']['scaled_br'] = pd.Series(scaled_bitrates.squeeze())


for entry in tqdmsito(sessions_pandas_list):

  if len(entry['sess_to_delete_pd'].count()) != 0:

    unscaled_payloads = entry['sess_to_delete_pd']['mean_payload_per_timestep'].to_numpy().reshape(-1, 1)

    scaled_payloads = payload_per_timestep_scaler.transform(unscaled_payloads).reshape(1,-1)

    entry['sess_to_delete_pd']['scaled_payload'] = pd.Series(scaled_payloads.squeeze())

    unscaled_bitrates = entry['sess_to_delete_pd']['br'].to_numpy().reshape(-1, 1)

    scaled_bitrates = bitrate_scaler.transform(unscaled_bitrates).reshape(1,-1)

    entry['sess_to_delete_pd']['scaled_br'] = pd.Series(scaled_bitrates.squeeze())

HBox(children=(FloatProgress(value=0.0, max=5761.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5761.0), HTML(value='')))




In [None]:
import pickle

path = "/content/drive/My Drive/Colab Notebooks/ds/"


with open( path + "input_sessions_dataframes_29.txt", "wb") as fp:   #Pickling

  pickle.dump(sessions_pandas_list, fp)


In [None]:
import pickle

path = "/content/drive/My Drive/Colab Notebooks/ds/"

with open(path + "input_sessions_dataframes_29.txt", "rb") as fp:   # Unpickling
  
  ef = pickle.load(fp)

print(len(ef))

5761


### Old Version (aggregations DS):

In [None]:
aggregated_ds = pd.DataFrame(b[112]['new_sess']).groupby(['client_cluster','channel']).agg(avg_br=('br', 'mean'), mean_payload=('mean_payload_per_timestep', 'mean'), session_count=('session_id', 'nunique'))
aggregated_ds = aggregated_ds.reset_index()

In [None]:
aggregated_ds = pd.DataFrame(b[112]['sess_to_delete']).groupby(['client_cluster','channel']).agg(avg_br=('br', 'mean'), mean_payload=('mean_payload_per_timestep', 'mean'), session_count=('session_id', 'nunique'))
aggregated_ds = aggregated_ds.reset_index()

In [None]:
from tqdm.notebook import tqdm

new_aggregations_datasets = []

for timestep in tqdm(range(0, len(b))):

  new_aggregations_ds = pd.DataFrame(b[timestep]['new_sess'])

  if len(new_aggregations_ds.count()) != 0:

    new_aggregations_ds =new_aggregations_ds.groupby(['client_cluster','channel'])\
      .agg(avg_br=('br', 'mean'), 
          mean_payload=('mean_payload_per_timestep', 'mean'), 
          session_count=('session_id', 'nunique'))
      
    new_aggregations_ds = new_aggregations_ds.reset_index()

    new_aggregations_datasets.append(new_aggregations_ds) 

  else:
    new_aggregations_datasets.append(None)
   

HBox(children=(FloatProgress(value=0.0, max=5761.0), HTML(value='')))




In [None]:
from tqdm.notebook import tqdm

aggregations_to_delete = []

for timestep in tqdm(range(0, len(b))):

  agg_to_delete_ds = pd.DataFrame(b[timestep]['sess_to_delete'])

  if len(agg_to_delete_ds.count()) != 0:

    agg_to_delete_ds = agg_to_delete_ds.groupby(['client_cluster','channel'])\
      .agg(avg_br=('br', 'mean'), 
          mean_payload=('mean_payload_per_timestep', 'mean'), 
          session_count=('session_id', 'nunique'))
      
    agg_to_delete_ds = agg_to_delete_ds.reset_index()

    aggregations_to_delete.append(agg_to_delete_ds) 

  else:
    aggregations_to_delete.append(None)
   

HBox(children=(FloatProgress(value=0.0, max=5761.0), HTML(value='')))




creating the scaling columns

In [None]:
for agg_to_delete_dataset in aggregations_to_delete:

  if agg_to_delete_dataset is not None:

    unscaled_payloads = agg_to_delete_dataset.mean_payload.to_numpy().reshape(-1, 1)

    scaled_payloads = payload_per_timestep_scaler.transform(unscaled_payloads).reshape(1,-1)

    agg_to_delete_dataset['scaled_payload'] = pd.Series(scaled_payloads.squeeze())


for agg_to_delete_dataset in aggregations_to_delete:

  if agg_to_delete_dataset is not None:

    unscaled_bitrates = agg_to_delete_dataset.avg_br.to_numpy().reshape(-1, 1)

    scaled_bitrates = bitrate_scaler.transform(unscaled_bitrates).reshape(1,-1)

    agg_to_delete_dataset['scaled_avg_br'] = pd.Series(scaled_bitrates.squeeze())

In [None]:
for agg_dataset in aggregations_datasets:

  if agg_dataset is not None:

    unscaled_payloads = agg_dataset.mean_payload.to_numpy().reshape(-1, 1)

    scaled_payloads = payload_per_timestep_scaler.transform(unscaled_payloads).reshape(1,-1)

    agg_dataset['scaled_payload'] = pd.Series(scaled_payloads.squeeze())


for agg_dataset in aggregations_datasets:

  if agg_dataset is not None:

    unscaled_bitrates = agg_dataset.avg_br.to_numpy().reshape(-1, 1)

    scaled_bitrates = bitrate_scaler.transform(unscaled_bitrates).reshape(1,-1)

    agg_dataset['scaled_avg_br'] = pd.Series(scaled_bitrates.squeeze())

In [None]:
import pickle

path = "/content/drive/My Drive/Colab Notebooks/ds/"


with open( path + "new_aggregations_dataset_29.txt", "wb") as fp:   #Pickling

  pickle.dump(new_aggregations_datasets, fp)


In [None]:
import pickle

path = "/content/drive/My Drive/Colab Notebooks/ds/"


with open( path + "aggregations_to_delete_dataset_29.txt", "wb") as fp:   #Pickling

  pickle.dump(aggregations_to_delete, fp)


In [None]:
import pickle
path = "/content/drive/My Drive/Colab Notebooks/ds/"

with open(path + "new_aggregations_dataset_29.txt", "rb") as fp:   # Unpickling
  
  ef = pickle.load(fp)

#assert b == optimized_ds}

print(len(ef))

5761


In [None]:
import pickle
path = "/content/drive/My Drive/Colab Notebooks/ds/"

with open(path + "aggregations_to_delete_dataset_29.txt", "rb") as fp:   # Unpickling
  
  af = pickle.load(fp)

#assert b == optimized_ds}

print(len(af))

5761


#### Adapter DS: (request cubes)

##### new version:

In [None]:
ds_28 = pd.read_csv('/content/drive/MyDrive/cleanDatasetsCDN/network/live/requests28.csv')
ds_29 = pd.read_csv('/content/drive/MyDrive/cleanDatasetsCDN/network/live/requests29.csv')
ds_30 = pd.read_csv('/content/drive/MyDrive/cleanDatasetsCDN/network/live/requests30.csv')
ds_weekend = pd.concat([ds_28, ds_29, ds_30])

In [None]:
ds_weekend = ds_weekend.groupby(['region','channel','session_id', 'windowstart']).agg(max_br=('chunk_quality', 'max'), mean_payload_per_req=('mean_payload_per_req', 'mean'), req_count=('req_count', 'sum'))
ds_weekend = ds_weekend.reset_index()
ds_weekend['windowstart'] = pd.to_datetime(ds_weekend['windowstart'])
ds_weekend['mean_payload_per_timestep'] = ds_weekend['mean_payload_per_req']  * ds_weekend['req_count']
aggregated_ds = ds_weekend.groupby(['region','channel', 'windowstart']).agg(avg_br=('max_br', 'mean'), mean_payload=('mean_payload_per_timestep', 'mean'), session_count=('session_id', 'nunique'))
aggregated_ds = aggregated_ds.reset_index()

In [None]:
aggregated_ds

Unnamed: 0,region,channel,windowstart,avg_br,mean_payload,session_count
0,0,0,2018-07-27 23:59:30,786432.0,6.313333e+02,3
1,0,0,2018-07-27 23:59:45,786432.0,6.586667e+02,6
2,0,0,2018-07-28 00:00:00,786432.0,2.080000e+03,12
3,0,0,2018-07-28 00:00:15,786432.0,2.711346e+03,12
4,0,0,2018-07-28 00:00:30,786432.0,2.861844e+03,12
...,...,...,...,...,...,...
2567069,3,40,2018-07-30 23:58:30,200.0,3.583140e+05,2
2567070,3,40,2018-07-30 23:58:45,200.0,1.031278e+06,2
2567071,3,40,2018-07-30 23:59:00,200.0,1.028374e+06,2
2567072,3,40,2018-07-30 23:59:15,200.0,4.988775e+06,4


In [None]:
aggregated_ds['windowstart'] = pd.to_datetime(aggregated_ds['windowstart'])

In [None]:
import datetime

def get_session_count(step_ds, cp, uc):

  filtered = step_ds[(step_ds.channel == cp) & (step_ds.region == uc)]

  sessions = filtered.session_count.values

  if len(sessions) > 0:

    return sessions[0]

  else:

    return 0

def get_aggregations_tensor_new(ds):

  timesteps = pd.Series(ds['windowstart'].unique())

  delta_t = datetime.timedelta(seconds = 15)

  incomming_sessions_tensor = torch.zeros((len(timesteps),N_cp,N_uc))

  t = 0

  for idx, timestep in enumerate(timesteps):

    end_timestep = timestep + delta_t

    step_ds = ds[ds['windowstart'] >= timestep]
    step_ds = step_ds[step_ds['windowstart'] < (timestep + delta_t)]


    for cp in range(0,N_cp):

      for uc in range(0,N_uc):

        incomming_sessions_tensor[idx][cp][uc] = get_session_count(step_ds,cp,uc)

    t = t + 1  

    if t%1000 == 0:
      print(t)

    

  return incomming_sessions_tensor


In [None]:
N_cp = 41

N_uc = 4

aggregations_tensor = get_aggregations_tensor_new(aggregated_ds)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000


In [None]:
timesteps = aggregated_ds['windowstart'].unique()


In [None]:
timesteps[-100:]

array(['2018-07-30T23:44:45.000000000', '2018-07-30T23:45:00.000000000',
       '2018-07-30T23:45:15.000000000', '2018-07-30T23:45:30.000000000',
       '2018-07-30T23:45:45.000000000', '2018-07-30T23:46:00.000000000',
       '2018-07-30T23:46:15.000000000', '2018-07-30T23:46:30.000000000',
       '2018-07-30T23:46:45.000000000', '2018-07-30T23:47:00.000000000',
       '2018-07-30T23:47:15.000000000', '2018-07-30T23:47:30.000000000',
       '2018-07-30T23:47:45.000000000', '2018-07-30T23:48:00.000000000',
       '2018-07-30T23:48:15.000000000', '2018-07-30T23:48:30.000000000',
       '2018-07-30T23:48:45.000000000', '2018-07-30T23:49:00.000000000',
       '2018-07-30T23:49:15.000000000', '2018-07-30T23:49:30.000000000',
       '2018-07-30T23:49:45.000000000', '2018-07-30T23:50:00.000000000',
       '2018-07-30T23:50:15.000000000', '2018-07-30T23:50:30.000000000',
       '2018-07-30T23:50:45.000000000', '2018-07-30T23:51:00.000000000',
       '2018-07-30T23:51:15.000000000', '2018-07-30

##### old version:


In [None]:
aggregated_ds = ds_25.groupby(['region','channel', 'windowstart']).agg(avg_br=('max_br', 'mean'), mean_payload=('mean_payload_per_timestep', 'mean'), session_count=('session_id', 'nunique'))

aggregated_ds = aggregated_ds.reset_index()

In [None]:
aggregated_ds.count()

In [None]:
def get_session_count(step_ds, cp, uc):

  filtered = step_ds[(step_ds.channel == cp) & (step_ds.region == uc)]

  sessions = filtered.session_count.values

  if len(sessions) > 0:

    return sessions[0]

  else:

    return 0

In [None]:
def get_aggregations_tensor(ds):

  timesteps = ds['windowstart']

  delta_t = datetime.timedelta(seconds = 15)

  incomming_sessions_tensor = torch.zeros((N_cp,N_uc,len(timesteps)))

  t = 0

  for timestep in timesteps:

    print('\n\n', timestep, '\n\n')

    step_ds = \
      ds[ds['windowstart'] >= timestep]\
      [ds['windowstart'] < (timestep + delta_t)]

    for cp in range(0,N_cp):

      for uc in range(0,N_uc):

        incomming_sessions_tensor[cp][uc][t] = get_session_count(step_ds,cp,uc)

    t = t + 1  

  return incomming_sessions_tensor


In [None]:
N_cp = 41
N_uc = 4
aggregations_tensor = get_aggregations_tensor(aggregated_ds)

In [None]:
import pickle

path = "/content/drive/MyDrive/cleanDatasetsCDN/network/live/"

with open( "aggregations_tensor_weekend_NEW_VERSION.txt", "wb") as fp:   #Pickling

  pickle.dump(aggregations_tensor, fp)


In [None]:
with open(path + "aggregations_tensor_weekend_NEW_VERSION.txt", "rb") as fp:   # Unpickling
  
  b = pickle.load(fp)

print(len(b))

b[0].shape

17281


torch.Size([41, 4])

In [None]:
b

tensor([[[ 3.,  5., 16.,  3.],
         [ 6., 11.,  9.,  4.],
         [ 5.,  0.,  5.,  2.],
         ...,
         [ 8.,  3.,  5.,  5.],
         [ 0.,  3.,  1.,  2.],
         [ 3.,  2.,  9.,  4.]],

        [[ 6.,  5., 16.,  3.],
         [ 9., 11.,  9.,  4.],
         [ 5.,  0.,  7.,  2.],
         ...,
         [ 8.,  3.,  5.,  5.],
         [ 1.,  3.,  2.,  2.],
         [ 4.,  2.,  9.,  3.]],

        [[12.,  5., 16.,  3.],
         [12., 11.,  9.,  4.],
         [ 3.,  0.,  3.,  2.],
         ...,
         [ 6.,  3.,  5.,  5.],
         [ 1.,  3.,  0.,  2.],
         [ 3.,  2.,  9.,  3.]],

        ...,

        [[ 0.,  4., 12.,  7.],
         [ 2.,  5.,  1.,  3.],
         [ 3.,  0.,  3.,  0.],
         ...,
         [ 5.,  1.,  7.,  2.],
         [ 0.,  2.,  1.,  0.],
         [ 7.,  3.,  6.,  5.]],

        [[ 0.,  4., 12.,  7.],
         [ 2.,  5.,  1.,  3.],
         [ 3.,  0.,  3.,  0.],
         ...,
         [ 5.,  1.,  5.,  2.],
         [ 0.,  2.,  1.,  0.],
         

In [None]:
request_cubes = []

for end_sec in tqdm(range(0, 5762)):

  new_request_cube = torch.Tensor(41,4,15)

  for idx in range(1,16):

    for cp in range(0,41):

      for uc in range(0,4):

        t_idx = end_sec-(idx-1)

        if t_idx >= 0:
        
          new_request_cube[cp][uc][-idx] = aggregations_tensor[cp][uc][t_idx]

        else:

          new_request_cube[cp][uc][-idx] = 0


  request_cubes.append(new_request_cube)



In [None]:
import pickle

path = "/content/drive/My Drive/Colab Notebooks/ds/"


with open( path + "request_cubes_28.txt", "wb") as fp:   #Pickling

  pickle.dump(request_cubes, fp)