In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from collections import OrderedDict
import matplotlib.pyplot as plt

In [2]:
# Read the header-less trace CSV, naming its seven columns at load time,
# force integer timestamps and order the requests by timestamp.
ori_df = (
    pd.read_csv(
        '../../traces/MSR-Cambridge2/CAMRESSDPA01-lvm0.csv',
        header=None,
        names=['Timestamp', 'Hostname', 'DiskNumber', 'Type', 'Offset', 'Size', 'ResponseTime'],
    )
    .astype({'Timestamp': np.int64})
    .sort_values(by='Timestamp')
)
ori_df.head()

Unnamed: 0,Timestamp,Hostname,DiskNumber,Type,Offset,Size,ResponseTime
0,128166373321652068,src1,0,Write,3173167104,36864,3126
1,128166373321653856,src1,0,Write,3154128896,4096,1337
2,128166373321653876,src1,0,Write,3154132992,4096,1318
3,128166373371652876,src1,0,Write,3744964608,4096,1998
4,128166373421651545,src1,0,Write,3173203968,32768,3009


In [3]:
# Show how the requests are distributed over hosts and disk numbers.
for column in ('Hostname', 'DiskNumber'):
    print(ori_df[column].value_counts())

# Number of requests (non-null timestamps) in the full trace.
ori_req_num = ori_df.loc[:, 'Timestamp'].count()
ori_req_num

src1    37415613
Name: Hostname, dtype: int64
0    37415613
Name: DiskNumber, dtype: int64


37415613

In [4]:
# Sum of Size over one row per distinct Offset (the first row seen for each).
total_size = ori_df.drop_duplicates(subset=['Offset']).loc[:, 'Size'].sum()
total_size

164910662144

In [5]:
# Per-cache capacities to simulate: 1%, 5%, 10% and 50% of the maximum,
# followed by the maximum itself (kept as an int).
max_cache_size = 10000000000
cache_size_array = [max_cache_size * fraction for fraction in (0.01, 0.05, 0.1, 0.5)] + [max_cache_size]

In [6]:
# Take the first 50% of the timestamp-sorted trace (the 0.5 factor below)
sample_num = int(ori_req_num * 0.5)
df = ori_df.head(sample_num)

In [7]:
# Number of requests (non-null timestamps) in the sampled trace.
req_num = df.loc[:, 'Timestamp'].count()
req_num

18707806

In [8]:
# Sum of Size over one row per distinct Offset of the sampled trace
# (the first row seen for each Offset).
total_size = df.drop_duplicates(subset=['Offset']).loc[:, 'Size'].sum()
total_size

155249052160

In [9]:
# Sanity-check the hash function: count how many distinct offsets land in
# each bucket.
#
# The original expression `x & 0b11111 % 4` parses as `x & (0b11111 % 4)`
# because `%` binds tighter than `&`. That happens to equal the intended
# `(x & 0b11111) % 4` for 4 buckets, but not for bucket counts that are not a
# power of two, so the grouping is made explicit here.
n_buckets = 4
unique_offsets = df['Offset'].drop_duplicates()
bucket_ids = ((unique_offsets // 10000 % 1000000) & 0b11111) % n_buckets
# Vectorized count per bucket; converted back to a plain list of ints.
workload_of_caches = np.bincount(bucket_ids.to_numpy(), minlength=n_buckets).tolist()
workload_of_caches        # check that the hash function spreads offsets evenly

[1342467, 1334327, 1334319, 1334166]

In [10]:
def make_requests():
    """Yield an ``(Offset, Size)`` pair for every row of the global ``df``, in row order."""
    yield from ((record.Offset, record.Size) for record in df.itertuples())

In [11]:
class Server:
    """A single cache server simulated with LRU replacement.

    Capacity is tracked in the same units as the ``size`` passed to
    :meth:`handle`. The cache maps ``fid -> size`` in an ``OrderedDict`` whose
    first entry is the least recently used one.
    """

    def __init__(self, space):
        self.space = space                          # total cache capacity
        self.remain = space                         # capacity not yet occupied
        self.cache = OrderedDict()                  # fid -> size, oldest entry first (LRU order)
        self.hit_count = 0                          # number of hits
        self.bite_hit_count = 0                     # sum of request sizes that hit
        self.miss_count = 0                         # number of misses
        self.bite_miss_count = 0                    # sum of request sizes that missed

    def _hit(self, fid, size):
        """Record a hit and mark ``fid`` as most recently used."""
        self.hit_count += 1
        self.bite_hit_count += size
        self.cache.move_to_end(fid)

    def _miss(self, fid, size):
        """Record a miss and admit ``fid``, evicting LRU entries as needed.

        An object larger than the whole cache can never be stored. It is
        counted as a miss and bypasses the cache, instead of evicting
        everything and then raising ``KeyError`` from ``popitem`` on an empty
        cache.
        """
        self.miss_count += 1
        self.bite_miss_count += size
        if size > self.space:
            return
        while self.remain < size:
            # Evict from the front of the OrderedDict, i.e. the LRU entry.
            _, evicted_size = self.cache.popitem(last=False)
            self.remain += evicted_size
        self.cache[fid] = size
        self.remain -= size

    def handle(self, fid, size):
        """Process one request for ``fid`` of ``size``, updating the statistics."""
        if fid in self.cache:
            self._hit(fid, size)
        else:
            self._miss(fid, size)

    def hit_rate(self):
        """Return hits / requests, or an explanatory string if there were no requests."""
        try:
            return self.hit_count / (self.hit_count + self.miss_count)
        except ZeroDivisionError:
            return "Server has not been requested yet!"

    def bite_hit_rate(self):
        """Return hit size / total requested size, or an explanatory string if that total is zero."""
        try:
            return self.bite_hit_count / (self.bite_hit_count + self.bite_miss_count)
        except ZeroDivisionError:
            return "Server has not been requested yet!"

    def get_occupation(self):
        """Return the amount of capacity currently occupied."""
        return self.space - self.remain

In [12]:
class Dispatcher:
    """Feeds every request to one big cache and to one of several small caches.

    The big cache has the combined capacity of all small caches
    (``cache_size * cache_number``) and sees every request, so the two
    configurations can be compared on the same trace.

    Parameters
    ----------
    file_number : unused; kept so existing callers keep working.
    cache_size : capacity of each small cache.
    cache_number : number of small caches.
    simple : if true, ``handle_requests`` is :meth:`simple_hash`; otherwise it
        is :meth:`load_balance`.
    """

    def __init__(self, file_number, cache_size, cache_number, simple=True):
        self.cache_number = cache_number
        self.big_cache = Server(cache_size * cache_number)
        self.small_caches = [Server(cache_size) for _ in range(cache_number)]
        # Sum of request sizes that hit on each small cache (load_balance only).
        self.small_caches_heat = [0] * cache_number
        # fid -> index of the small cache the fid was last placed on.
        # Entries are created lazily; a fid with no entry yet is looked up on
        # its hash server. This gives the same result as pre-populating the
        # mapping with the hash of every offset, without reading the
        # notebook-global trace from inside the constructor.
        self.file_mapper = {}
        self.handle_requests = self.simple_hash if simple else self.load_balance

    def _hash_server(self, fid):
        """Map ``fid`` to a small-cache index.

        The parentheses matter: ``x & 0b11111 % n`` parses as
        ``x & (0b11111 % n)``, which only equals ``(x & 0b11111) % n`` when
        ``n`` is a power of two (it does for the notebook's 4 caches).
        """
        return ((fid // 10000 % 1000000) & 0b11111) % self.cache_number

    def load_balance(self, fid, size):
        """Serve ``fid`` from its current small cache, or place it on the least-hit one."""
        server = self.file_mapper.get(fid)
        if server is None:
            server = self._hash_server(fid)
        if fid in self.small_caches[server].cache:
            self.small_caches[server].handle(fid, size)
            self.small_caches_heat[server] += size
        else:
            # Not cached where expected: place it on the small cache with the
            # lowest accumulated hit size (first one on ties) and remember it.
            server = self.small_caches_heat.index(min(self.small_caches_heat))
            self.small_caches[server].handle(fid, size)
            self.file_mapper[fid] = server
        self.big_cache.handle(fid, size)

    def simple_hash(self, fid, size):
        """Send the request to the big cache and to the small cache chosen by the hash."""
        self.big_cache.handle(fid, size)
        self.small_caches[self._hash_server(fid)].handle(fid, size)

In [13]:
# Number of distinct offsets in the sampled trace, and number of small caches.
FILE_NUM = df['Offset'].value_counts().shape[0]
CACHE_NUMBER = 4
print(FILE_NUM, CACHE_NUMBER, sep='\n')

5345279
4


In [14]:
# Disabled experiment: the whole cell is a bare string literal, so none of it
# runs (the notebook only echoes the string). It sweeps cache sizes with the
# simple-hash dispatcher.
# NOTE(review): if re-enabled, the last line fills big_server_hit_ratio from
# the small caches' hit counts rather than big_cache.hit_count, and the loop
# rebinds cache_size_array to fractions of total_size.
'''small_server_hit_rate = []
small_server_hit_number = []
small_server_hit_ratio = []
small_server_hit_ratio_syn = []
big_server_hit_rate = []
big_server_hit_number = []
big_server_hit_ratio = []
big_server_hit_ratio_syn = []

cache_size_array = []
for cache_size in range(total_size // 100, total_size // 10, total_size // 100):
    cache_size_array.append(cache_size / total_size)
    hash_dispatcher = Dispatcher(FILE_NUM, cache_size, CACHE_NUMBER)
    for fid, size in make_requests():
        hash_dispatcher.handle_requests(fid, size)
    small_server_hit_rate.append([i.hit_rate() for i in hash_dispatcher.small_caches])
    small_server_hit_number.append(sum([i.hit_count for i in hash_dispatcher.small_caches]))
    small_server_hit_ratio.append((sum([i.hit_count for i in hash_dispatcher.small_caches]))/req_num)
    big_server_hit_rate.append(hash_dispatcher.big_cache.hit_rate())
    big_server_hit_number.append(hash_dispatcher.big_cache.hit_count)
    big_server_hit_ratio.append((sum([i.hit_count for i in hash_dispatcher.small_caches]))/req_num)'''

'small_server_hit_rate = []\nsmall_server_hit_number = []\nsmall_server_hit_ratio = []\nsmall_server_hit_ratio_syn = []\nbig_server_hit_rate = []\nbig_server_hit_number = []\nbig_server_hit_ratio = []\nbig_server_hit_ratio_syn = []\n\ncache_size_array = []\nfor cache_size in range(total_size // 100, total_size // 10, total_size // 100):\n    cache_size_array.append(cache_size / total_size)\n    hash_dispatcher = Dispatcher(FILE_NUM, cache_size, CACHE_NUMBER)\n    for fid, size in make_requests():\n        hash_dispatcher.handle_requests(fid, size)\n    small_server_hit_rate.append([i.hit_rate() for i in hash_dispatcher.small_caches])\n    small_server_hit_number.append(sum([i.hit_count for i in hash_dispatcher.small_caches]))\n    small_server_hit_ratio.append((sum([i.hit_count for i in hash_dispatcher.small_caches]))/req_num)\n    big_server_hit_rate.append(hash_dispatcher.big_cache.hit_rate())\n    big_server_hit_number.append(hash_dispatcher.big_cache.hit_count)\n    big_server_hit_

In [15]:
# Disabled cell (bare string literal, nothing runs): hit-count plot for the
# simple-hash sweep, followed by a few ad-hoc inspections of df.
'''plt.figure(figsize=(15,8))
plt.plot(cache_size_array, small_server_hit_number, color='red', label='little caches')
plt.plot(cache_size_array, big_server_hit_number, color='black', label='big cache')
plt.xlabel("little cache size / file pool total size")
plt.ylabel("hit number")
plt.title("simple hash")
plt.legend()
#small_server_hit_ratio
df.tail()
#print(df['Hostname'].value_counts())
#df['Hostname'].value_counts()
df['Hostname'].count()'''

'plt.figure(figsize=(15,8))\nplt.plot(cache_size_array, small_server_hit_number, color=\'red\', label=\'little caches\')\nplt.plot(cache_size_array, big_server_hit_number, color=\'black\', label=\'big cache\')\nplt.xlabel("little cache size / file pool total size")\nplt.ylabel("hit number")\nplt.title("simple hash")\nplt.legend()\n#small_server_hit_ratio\ndf.tail()\n#print(df[\'Hostname\'].value_counts())\n#df[\'Hostname\'].value_counts()\ndf[\'Hostname\'].count()'

In [16]:
# Disabled cell (bare string literal, nothing runs): hit-ratio plot for the
# simple-hash sweep.
'''plt.figure(figsize=(15,8))
plt.plot(cache_size_array, small_server_hit_ratio, color='red', label='little caches')
plt.plot(cache_size_array, big_server_hit_ratio, color='black', label='big cache')
plt.xlabel("little cache size / file pool total size")
plt.ylabel("hit ratio")
plt.title("simple hash")
plt.legend()'''

'plt.figure(figsize=(15,8))\nplt.plot(cache_size_array, small_server_hit_ratio, color=\'red\', label=\'little caches\')\nplt.plot(cache_size_array, big_server_hit_ratio, color=\'black\', label=\'big cache\')\nplt.xlabel("little cache size / file pool total size")\nplt.ylabel("hit ratio")\nplt.title("simple hash")\nplt.legend()'

In [17]:
# Disabled cell (bare string literal, nothing runs): per-cache hit-rate plot
# for the simple-hash sweep.
'''colors = ['green', 'yellow', 'blue', 'red']
plt.figure(figsize=(15,8))
for i in range(CACHE_NUMBER):
    plt.plot(cache_size_array, [j[i] for j in small_server_hit_rate], color=colors[i], label='little cache ' + str(i + 1))
plt.plot(cache_size_array, big_server_hit_rate, color='black', label='big cache')
plt.xlabel("little cache size / file pool total size")
plt.ylabel("hit rate")
plt.title("simple hash")
plt.legend()'''

'colors = [\'green\', \'yellow\', \'blue\', \'red\']\nplt.figure(figsize=(15,8))\nfor i in range(CACHE_NUMBER):\n    plt.plot(cache_size_array, [j[i] for j in small_server_hit_rate], color=colors[i], label=\'little cache \' + str(i + 1))\nplt.plot(cache_size_array, big_server_hit_rate, color=\'black\', label=\'big cache\')\nplt.xlabel("little cache size / file pool total size")\nplt.ylabel("hit rate")\nplt.title("simple hash")\nplt.legend()'

In [None]:
# Replay the sampled trace through the load-balancing dispatcher once per
# cache size, recording final statistics and a per-request time series.
#
# Naming: lists ending in `_syn_` hold one per-request series for each cache
# size; the other `*_` lists hold one final value for each cache size.
#
# NOTE: every per-request series grows by one element per request, so memory
# use scales with (number of requests) x (number of series) x (cache sizes).

# Final statistics, one entry per cache size.
small_server_hit_rate_ = []
small_server_hit_number_ = []
small_server_hit_ratio_ = []
big_server_hit_rate_ = []
big_server_hit_number_ = []
big_server_hit_ratio_ = []

# Per-request series, one list per cache size.
small_server_hit_number_syn_ = []
small_server_hit_ratio_syn_ = []
small_server_occupation_syn_ = []
small_server_remain_syn_ = []
big_server_hit_number_syn_ = []
big_server_hit_ratio_syn_ = []
big_server_occupation_syn_ = []
big_server_remain_syn_ = []

# Declared by the original cell but never filled; kept so the names still exist.
small_server_occupation = []
small_server_remain = []
big_server_occupation = []
big_server_remain = []

cache_size_array_ = []
for cache_size in cache_size_array:
    cache_size_array_.append(cache_size)
    hash_dispatcher = Dispatcher(FILE_NUM, cache_size, CACHE_NUMBER, False)
    big_cache = hash_dispatcher.big_cache
    small_caches = hash_dispatcher.small_caches
    cur_req = 0

    sub_big_server_hit_ratio_syn_ = []
    sub_small_server_hit_ratio_syn_ = []

    sub_big_server_hit_number_syn_ = []
    sub_small_server_hit_number_syn_ = []

    sub_big_server_occupation_syn_ = []
    sub_small_server_occupation_syn_ = []

    # These two lists were appended to under different, undefined names.
    sub_big_server_remain_syn_ = []
    sub_small_server_remain_syn_ = []

    for fid, size in make_requests():
        hash_dispatcher.handle_requests(fid, size)
        cur_req += 1

        # Computed once per request and reused for both the count and the ratio.
        small_hits = sum(cache.hit_count for cache in small_caches)

        sub_big_server_hit_ratio_syn_.append(big_cache.hit_count / cur_req)
        sub_small_server_hit_ratio_syn_.append(small_hits / cur_req)

        sub_big_server_hit_number_syn_.append(big_cache.hit_count)
        sub_small_server_hit_number_syn_.append(small_hits)

        # Each small cache is a Server and exposes space/remain directly; it
        # has no `hash_dispatcher` attribute.
        sub_big_server_occupation_syn_.append(big_cache.space - big_cache.remain)
        sub_small_server_occupation_syn_.append(sum(cache.space - cache.remain for cache in small_caches))

        sub_big_server_remain_syn_.append(big_cache.remain)
        sub_small_server_remain_syn_.append(sum(cache.remain for cache in small_caches))

    big_server_hit_ratio_syn_.append(sub_big_server_hit_ratio_syn_)
    small_server_hit_ratio_syn_.append(sub_small_server_hit_ratio_syn_)

    big_server_hit_number_syn_.append(sub_big_server_hit_number_syn_)
    small_server_hit_number_syn_.append(sub_small_server_hit_number_syn_)

    big_server_occupation_syn_.append(sub_big_server_occupation_syn_)
    small_server_occupation_syn_.append(sub_small_server_occupation_syn_)

    big_server_remain_syn_.append(sub_big_server_remain_syn_)
    small_server_remain_syn_.append(sub_small_server_remain_syn_)

    total_small_hits = sum(cache.hit_count for cache in small_caches)
    small_server_hit_rate_.append([cache.hit_rate() for cache in small_caches])
    small_server_hit_number_.append(total_small_hits)
    small_server_hit_ratio_.append(total_small_hits / req_num)

    big_server_hit_rate_.append(big_cache.hit_rate())
    big_server_hit_number_.append(big_cache.hit_count)
    big_server_hit_ratio_.append(big_cache.hit_count / req_num)

In [None]:
# Total hits after replaying the trace, per cache size.
# The x values are the absolute per-cache sizes taken from cache_size_array,
# not a fraction of the file pool, so the axis label says so.
plt.figure(figsize=(15,8))
plt.plot(cache_size_array_, small_server_hit_number_, color='red', label='little caches')
plt.plot(cache_size_array_, big_server_hit_number_, color='black', label='big cache')
plt.xlabel("little cache size (bytes)")
plt.ylabel("hit number")
plt.title("load balanced dispatcher")
plt.legend()

In [None]:
# Overall hit ratio (hits / req_num) after replaying the trace, per cache size.
# The y label previously said "hit number" although ratios are plotted, and
# the x values are absolute per-cache sizes rather than a fraction.
plt.figure(figsize=(15,8))
plt.plot(cache_size_array_, small_server_hit_ratio_, color='red', label='little caches')
plt.plot(cache_size_array_, big_server_hit_ratio_, color='black', label='big cache')
plt.xlabel("little cache size (bytes)")
plt.ylabel("hit ratio")
plt.title("load balanced dispatcher")
plt.legend()

In [None]:
# Running hit ratio over the trace for the entry at index 4 of the sweep
# (the last of the five cache sizes).
fig, ax = plt.subplots(figsize=(15, 8))
for series, colour, name in (
    (small_server_hit_ratio_syn_[4], 'red', 'little caches'),
    (big_server_hit_ratio_syn_[4], 'black', 'big cache'),
):
    ax.plot(range(len(series)), series, color=colour, label=name)
ax.set_xlabel("request number")
ax.set_ylabel("hit ratio")
ax.set_title("load balanced dispatcher")
ax.legend()

In [None]:
# Final hit rate of each small cache and of the big cache, per cache size.
# The x values are absolute per-cache sizes, so the label no longer claims a
# ratio; colours are cycled so more than four caches do not raise IndexError.
colors = ['green', 'yellow', 'blue', 'red']
plt.figure(figsize=(15,8))
for i in range(CACHE_NUMBER):
    plt.plot(cache_size_array_, [j[i] for j in small_server_hit_rate_], color=colors[i % len(colors)], label='little cache ' + str(i + 1))
plt.plot(cache_size_array_, big_server_hit_rate_, color='black', label='big cache')
plt.xlabel("little cache size (bytes)")
plt.ylabel("hit rate")
plt.title("load balanced dispatcher")
plt.legend()

In [None]:
# Array with the number of requests per distinct offset, in value_counts order.
x = np.asarray(df['Offset'].value_counts())

In [None]:
# Histogram (log-scaled counts) of how often each offset is requested,
# restricted to 0..199 accesses in unit-wide bins. The first two entries of x
# are skipped, as in the original cell.
# The histogram now carries a label, so plt.legend() has an artist to show.
plt.figure(figsize=(15,8))
bins = np.arange(0, 200, 1)
plt.hist(x[2:], bins, alpha=0.5, log = True, label='offsets')
plt.xlabel("requests per offset")
plt.ylabel("number of offsets (log scale)")
plt.title("offset access-count distribution")
plt.legend()

In [None]:
# Same histogram over a wider range: 0..24999 accesses in bins of width 10.
# The first two entries of x are skipped, as in the original cell.
# The histogram now carries a label, so plt.legend() has an artist to show.
plt.figure(figsize=(15,8))
bins = np.arange(0, 25000, 10)
plt.hist(x[2:], bins, alpha=0.5, log = True, label='offsets')
plt.xlabel("requests per offset")
plt.ylabel("number of offsets (log scale)")
plt.title("offset access-count distribution")
plt.legend()

In [None]:
# Disabled cell (bare string literal, nothing runs): an earlier attempt at
# turning cumulative hit counts into per-request increments.
# NOTE(review): it indexes small_server_hit_number_[4][i] and
# big_server_hit_number_[4][i], but the simulation cell appends a single total
# per cache size to those lists, so this would fail if re-enabled.
'''small_server_hit_seq = []
big_server_hit_seq = []

ini_hit = 0

for i in range(len(small_server_hit_number_syn_[4])):
    req = small_server_hit_number_[4][i]
    if i == 0:
        prevreq = ini_hit
    else:
        prevreq = small_server_hit_number_[4][i-1]
    small_server_hit_seq.append(req-prevreq)

for i in range(len(big_server_hit_number_syn_[4])):
    req = big_server_hit_number_[4][i]
    if i == 0:
        prevreq = ini_hit
    else:
        prevreq = big_server_hit_number_[4][i-1]
    big_server_hit_seq.append(req-prevreq)'''
        
    

In [None]:
# Length of the per-request hit-count series left over from the last cache
# size simulated (the sub_* lists are rebound on every sweep iteration).
print(len(sub_small_server_hit_number_syn_))

In [None]:
# Convert the cumulative hit counters of the last simulated cache size into
# per-request increments: element k is counter[k] - counter[k-1], with the
# first element measured against ini_hit.
ini_hit = 0

small_server_hit_seq = [
    current - previous
    for previous, current in zip(
        [ini_hit] + sub_small_server_hit_number_syn_[:-1],
        sub_small_server_hit_number_syn_,
    )
]

big_server_hit_seq = [
    current - previous
    for previous, current in zip(
        [ini_hit] + sub_big_server_hit_number_syn_[:-1],
        sub_big_server_hit_number_syn_,
    )
]

In [None]:
# Request indices at which the small caches gained more hits than the big cache.
diff_small_and_big = [
    idx
    for idx in range(len(small_server_hit_seq))
    if small_server_hit_seq[idx] > big_server_hit_seq[idx]
]

In [None]:
# Request index of each occasion on which the small caches gained more hits
# than the big cache. The legend previously read 'big cache', which does not
# describe this series.
plt.figure(figsize=(15,8))
plt.plot(range(len(diff_small_and_big)), diff_small_and_big, color='red', label='little caches hit, big cache miss')
plt.xlabel("difference number")
plt.ylabel("request index")
plt.title("load balanced dispatcher")
plt.legend()

In [None]:
# Per-request difference between small-cache and big-cache hit increments.
diff_small_and_big_seq = [
    small_server_hit_seq[idx] - big_server_hit_seq[idx]
    for idx in range(len(small_server_hit_seq))
]

print(len(diff_small_and_big_seq))

In [None]:
# Per-request hit difference (small caches minus big cache) over the trace.
# The x axis is the request index and the series is a difference, so the
# labels copied from the earlier plot ('big cache', "difference number") are
# replaced.
plt.figure(figsize=(15,8))
plt.plot(range(len(diff_small_and_big_seq)), diff_small_and_big_seq, color='red', label='little caches - big cache')
plt.xlabel("request number")
plt.ylabel("per-request hit difference")
plt.title("load balanced dispatcher")
plt.legend()

In [None]:
# Running total of the per-request hit difference (small caches minus big cache).
diff_small_and_big_seq_accumulation = []

acc = 0

for idx, small_gain in enumerate(small_server_hit_seq):
    acc = small_gain - big_server_hit_seq[idx] + acc
    diff_small_and_big_seq_accumulation.append(acc)

In [None]:
# Cumulative hit difference (small caches minus big cache) over the trace.
# The x axis is the request index and the series is a running total, so the
# labels copied from the earlier plot are replaced.
plt.figure(figsize=(15,8))
plt.plot(range(len(diff_small_and_big_seq_accumulation)), diff_small_and_big_seq_accumulation, color='red', label='little caches - big cache (cumulative)')
plt.xlabel("request number")
plt.ylabel("cumulative hit difference")
plt.title("load balanced dispatcher")
plt.legend()

In [None]:
# Partial analysis: per-request hit difference (small caches minus big cache)
# for the first 5,000,000 requests only.
# The x axis is the request index and the series is a difference, so the
# labels copied from the earlier plot are replaced.
diff_small_and_big_seq_head = diff_small_and_big_seq[0:5000000]

plt.figure(figsize=(15,8))
plt.plot(range(len(diff_small_and_big_seq_head)), diff_small_and_big_seq_head, color='red', label='little caches - big cache')
plt.xlabel("request number")
plt.ylabel("per-request hit difference")
plt.title("load balanced dispatcher")
plt.legend()

In [None]:
# How many requests gave the small caches more hits than the big cache.
print(len(diff_small_and_big))

In [None]:
# Spot check of one recorded request index (position 23 is an arbitrary pick).
diff_small_and_big[23]

In [None]:
# Peek at the first 20 per-request hit increments of the small caches.
for position in range(20):
    print(small_server_hit_seq[position])

In [None]:
# Peek at the first 20 request indices where the small caches out-hit the big cache.
for position in range(20):
    print(diff_small_and_big[position])

In [None]:
# Peek at the last 20 such request indices, newest first.
last_position = len(diff_small_and_big) - 1
for step in range(20):
    print(diff_small_and_big[last_position - step])