# Meta Learning Cache Replacement Policy

## Install Dependency

In [None]:
# %load extract_tar_gz.py
import argparse
import os


EXTENSION = '.tar.gz'
BLKPARSE = '.blkparse'


def main(input_dir, output_dir):
  """Extract every ``.tar.gz`` archive of input_dir and collect the traces.

  Each archive is unpacked with ``tar`` into the current working directory;
  afterwards every ``*.blkparse`` file found there is moved into output_dir.
  output_dir (including missing parent directories) is created on demand.

  Args:
    input_dir: directory that holds the ``.tar.gz`` archives.
    output_dir: directory that receives the extracted ``.blkparse`` files.

  Raises:
    AssertionError: if input_dir is not an existing directory.
  """
  import shlex

  assert os.path.isdir(input_dir), "The input directory {} does not exist".format(input_dir)

  if not os.path.isdir(output_dir):
    print("Creating output directory {}".format(output_dir))
    # makedirs also creates missing parents, where mkdir would fail
    os.makedirs(output_dir)

  # Quote every path handed to the shell so that spaces or shell
  # metacharacters in a name cannot split or alter the command.
  quoted_output_dir = shlex.quote(output_dir)

  for filename in os.listdir(input_dir):
    if not filename.endswith(EXTENSION):
      continue

    print("Extracting and moving {}".format(filename))
    # os.path.join copes with a trailing "/" and with input_dir == "/"
    archive_path = os.path.join(input_dir, filename)
    execute_command("tar -xvzf {}".format(shlex.quote(archive_path)))
    # The glob stays unquoted on purpose: the shell has to expand it
    execute_command("mv ./*{} {}".format(BLKPARSE, quoted_output_dir))
    

def execute_command(command):
  """Run command through the system shell and return its status.

  Args:
    command: shell command line, handed unchanged to ``os.system``.

  Returns:
    The value returned by ``os.system`` (0 when the command succeeded).
    A non-zero status is also reported on stdout so that a failed step
    does not go unnoticed.
  """
  status = os.system(command)
  if status != 0:
    print("Command failed with status {}: {}".format(status, command))
  return status


if __name__ == '__main__':
  # Command-line entry point: both directories are mandatory options.
  parser = argparse.ArgumentParser()
  for flag, description in (
      ("--input_dir", "the directory containing the tar.gz files"),
      ("--output_dir", "the directory containing content of the tar.gz files"),
  ):
    parser.add_argument(flag, type=str, help=description, required=True)
  args = parser.parse_args()
  main(args.input_dir, args.output_dir)


In [26]:
import sys
import random
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm 
from collections import Counter, deque, defaultdict
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
import tensorflow as tf


## Decompress tar.gz

**Don't run unless you have a lot of storage!**

In [None]:
# Unpack the raw FIU archives into the trace directory used by the later cells
input_dir, output_dir = "FIU_raw", "FIU_trace"
main(input_dir=input_dir, output_dir=output_dir)

## Block Cache Model

### Global Variables

In [62]:
# Upper bound used by belady_opt as a stand-in "next access time" for
# blocks that are never requested again (it is decremented on each use)
maxpos = 10 ** 12

# Feature columns per cache entry: recency, frequency, block number
num_params = 3

# Number of blocks the simulated cache holds
cache_size = 100

# Number of consecutive inputs kept in one sequence
sequence_length = 5

# Sampling period; belady_opt compares the request index modulo this value
sampling_freq = cache_size

# Number of cache entries that getY labels with 1 (70% of the cache)
eviction = int(0.7 * cache_size)

# Prediction tallies for LRU and LFU, all starting at zero
lruCorrect = lruIncorrect = 0
lfuCorrect = lfuIncorrect = 0

# Empty feature matrix (num_params columns) and empty label column
X = np.empty((0, num_params), dtype=np.int64)
Y = np.empty((0, 1), dtype=np.int64)

### Load workload
**Note:** `cheetah.cs.fiu.edu-110108-113008.1.blkparse` does not contain correct data, so the `.2` trace is loaded instead.

In [29]:
# Space-separated blkparse trace without a header row
train = "FIU_trace/cheetah.cs.fiu.edu-110108-113008.2.blkparse"

df = pd.read_csv(train, header=None, sep=' ')
df.columns = [
    'timestamp', 'pid', 'pname', 'blockNo', 'blockSize',
    'readOrWrite', 'bdMajor', 'bdMinor', 'hash',
]

# Keep only the first 10% of the requested block numbers
trainBlockTrace = df['blockNo'].tolist()
del trainBlockTrace[int(len(trainBlockTrace) * 0.1):]

len(trainBlockTrace)

2271127

### Data Preprocessing and Data Construction

In [76]:
# Taken from Shehbaz
def get_recency(lru, cache):
    """Return, for every block in cache, its position in the lru list.

    A block that occurs several times in lru gets the index of its last
    occurrence; a block that is missing from lru gets 0.
    """
    position_of = {block: index for index, block in enumerate(lru)}
    return [position_of.get(block, 0) for block in cache]

def get_frequency(lfu, cache):
    """Return the lfu entry of every block in cache, in cache order."""
    return [lfu[block] for block in cache]

def normalize_columns(input):
    """Scale input column by column using sklearn's ``normalize`` (axis=0)."""
    column_scaled = normalize(input, axis=0)
    return column_scaled

In [83]:
def get_single_length_input(lfu, lru, cache, preprocess_func):
    """Build the feature matrix that describes the current cache contents.

    The matrix has one row per cache location and three columns: recency
    (from get_recency), frequency (from get_frequency) and block number.
    It is handed to preprocess_func, whose result is returned.
    """
    feature_columns = (
        get_recency(lru, cache),
        get_frequency(lfu, cache),
        cache[:],
    )
    return preprocess_func(np.column_stack(feature_columns))


SEQ_DIM = 0  # axis of the input history that indexes the sequence steps


def get_multiple_length_input(sequence_length, prev_inputs, lfu, lru, cache, preprocess_func):
    """Advance the input history by one step.

    The oldest entry of prev_inputs is dropped and the input computed from
    the current lfu/lru/cache state is appended, so the returned array keeps
    the same length along SEQ_DIM.

    Raises:
        AssertionError: if prev_inputs does not hold sequence_length steps.
    """
    assert prev_inputs.shape[SEQ_DIM] == sequence_length
    newest = get_single_length_input(lfu, lru, cache, preprocess_func)
    history_tail = prev_inputs[1:]
    return np.vstack((history_tail, newest[np.newaxis]))
    

def get_output(pre_cache, post_cache):
    """Placeholder that is not implemented yet: ignores both arguments and returns None."""
    return None

# Taken from Shehbaz.
def getY(C, D, num_evict=None):
    """Label every cache entry as selected for eviction (1) or not (0).

    Args:
        C: sequence of the blocks currently held in the cache.
        D: mapping with one entry per cached block. It is fed to ``Counter``,
            so its values act as counts: the num_evict entries with the
            largest values are selected.
        num_evict: number of entries to select. Defaults to the module-level
            ``eviction`` setting.

    Returns:
        A list aligned with C that holds 1 where the block is one of the
        selected values and 0 elsewhere.

    Raises:
        AssertionError: if C and D differ in length, if fewer than num_evict
            entries can be selected, if the number of labelled blocks is not
            num_evict, or if a selected value does not occur in C.
    """
    if num_evict is None:
        num_evict = eviction

    assert len(C) == len(D)

    # NOTE(review): Counter(D) ranks entries by D's *values*. belady_opt
    # stores "next access time -> block" in this mapping, so the ranking is
    # by block number rather than by next access time - confirm that this is
    # the intended selection.
    top_entries = dict(Counter(D).most_common(num_evict))
    assert len(top_entries) == num_evict

    # A set gives constant-time membership tests instead of scanning the
    # selected values once per cache entry.
    evict_values = set(top_entries.values())
    Y_current = [1 if e in evict_values else 0 for e in C]

    assert Y_current.count(1) == num_evict
    assert evict_values.issubset(set(C))
    return Y_current

### Belady Optimal Algorithm (From Shehbaz)

In [None]:
def belady_opt(blocktrace, frame):
    """Simulate Belady's optimal replacement policy and return its hit rate.

    On a miss with a full cache, the block whose next request lies farthest
    in the future is evicted. While simulating, per-block request counts and
    an LRU ordering are maintained; on sampled misses with a full cache they
    are passed to populateData / lruPredict / lfuPredict.

    The function relies on module-level names: ``maxpos`` (modified through
    ``global``), ``sampling_freq``, ``tqdm`` and the helpers ``populateData``,
    ``lruPredict`` and ``lfuPredict``.
    NOTE(review): those three helpers are not defined in this part of the
    notebook - make sure they exist before a sampled miss is reached.

    Args:
        blocktrace: sequence of requested block numbers, in request order.
        frame: number of blocks the cache can hold; must be positive.

    Returns:
        hits / (hits + misses) as a float, or 0.0 for an empty trace.

    Raises:
        ValueError: if frame is not positive.
    """
    global maxpos

    if frame <= 0:
        raise ValueError("frame must be a positive cache size, got {}".format(frame))

    optimal = defaultdict(deque)   # block -> pending request times
    deleted = defaultdict(int)     # next request time -> cached block
    lfu = defaultdict(int)         # block -> number of requests so far
    lru = []                       # cached blocks, least recently used first

    # Build the whole index for finding optimal eviction ordering
    for request_time, block in enumerate(tqdm(blocktrace, desc="OPT: building index")):
        optimal[block].append(request_time)

    hit, miss = 0, 0
    cache = []

    for request_time, block in enumerate(tqdm(blocktrace, desc="OPT")):
        # increase frequency count
        lfu[block] += 1

        # Drop the current request from the index. It is only still at the
        # head for a block's first request; later requests were already
        # consumed when they were scheduled in `deleted`.
        if optimal[block] and optimal[block][0] == request_time:
            optimal[block].popleft()

        if block in cache:
            # Cache hit: move the block to the MRU position
            hit += 1
            lru.remove(block)
            lru.append(block)

            # The block was scheduled under exactly this request time
            assert request_time in deleted
            del deleted[request_time]
        else:
            # Cache miss
            miss += 1
            if len(cache) == frame:
                assert len(deleted) == frame
                # Largest key = cached block whose next request is farthest away
                evictpos = max(deleted)

                if request_time % sampling_freq + 1 == sampling_freq:
                    Y_OPT = populateData(lfu, lru, cache, deleted)
                    lruPredict(cache, lru, Y_OPT)
                    lfuPredict(cache, lfu, Y_OPT)

                victim = deleted[evictpos]
                # Reuse the victim's cache slot for the incoming block
                cache[cache.index(victim)] = block
                lru.remove(victim)
                del deleted[evictpos]
            else:
                cache.append(block)
            lru.append(block)

        # Schedule the block under its next request time. Blocks that are
        # never requested again get a unique, decreasing key near maxpos.
        if optimal[block]:
            deleted[optimal[block].popleft()] = block
        else:
            deleted[maxpos] = block
            maxpos -= 1

    total = hit + miss
    # An empty trace has no requests; report 0.0 instead of dividing by zero
    hitrate = hit / total if total else 0.0

    return hitrate

## Generate training data

In [None]:
# Run the OPT simulation on the training trace, then alias the global
# feature matrix and label column as the training set
trainHitrate = belady_opt(blocktrace=trainBlockTrace, frame=cache_size)
X_train, Y_train = X, Y

#### Sample test

In [85]:
# Toy state: three cached blocks with hand-written frequency and recency data
lfu = {2: 1, 3: 2, 4: 1}
lru = [2, 3, 4]
cache = [3, 2, 4]
hidden = np.zeros(shape=(5, 3, 3))

# Step 1: append the raw features (identity preprocessing)
hidden = get_multiple_length_input(
    sequence_length=5,
    prev_inputs=hidden,
    lfu=lfu,
    lru=lru,
    cache=cache,
    preprocess_func=lambda x: x,
)
print(hidden)

# Step 2: append the same features after column normalization
hidden = get_multiple_length_input(
    sequence_length=5,
    prev_inputs=hidden,
    lfu=lfu,
    lru=lru,
    cache=cache,
    preprocess_func=normalize_columns,
)
print(hidden)

[[[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[1. 2. 3.]
  [0. 1. 2.]
  [2. 1. 4.]]]
[[[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[0.         0.         0.        ]
  [0.         0.         0.        ]
  [0.         0.         0.        ]]

 [[1.         2.         3.        ]
  [0.         1.         2.        ]
  [2.         1.         4.        ]]

 [[0.4472136  0.81649658 0.55708601]
  [0.         0.40824829 0.37139068]
  [0.89442719 0.40824829 0.74278135]]]


In [91]:
# Index the training trace: block number -> positions at which it is requested
blocktrace = trainBlockTrace
OPT = defaultdict(deque)
for i, block in enumerate(tqdm(blocktrace, desc="OPT: building index")):
    OPT[block].append(i)

# Bare expression so that the notebook displays the index
OPT

HBox(children=(IntProgress(value=0, description='OPT: building index', max=2271127, style=ProgressStyle(descri…




defaultdict(collections.deque,
            {18579576: deque([0,
                    455484,
                    939522,
                    1013662,
                    1021219,
                    1170906,
                    1356731,
                    1356732,
                    1502856,
                    1533821,
                    1604320,
                    1675364,
                    1675365,
                    1678534,
                    1832100,
                    1879767,
                    1942179,
                    2028264]),
             508516672: deque([1,
                    15,
                    7076,
                    7088,
                    7111,
                    7142,
                    1675257,
                    1679821,
                    1686471,
                    1686900,
                    1687427,
                    1690044,
                    1692619,
                    1694750]),
             508516680: deque([2,
             