In [1]:
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import matplotlib.pyplot as plt

chunk_size_shift = 12  # log2 of the chunk size in bytes: 1 << 12 == 4096 (4 KiB)

def load_data(csv_paths: 'list[str] | str'):
    """Read one or more headerless trace CSVs into a single DataFrame.

    Args:
        csv_paths: Paths of the CSV files to read, in the order their rows
            should appear. A single path given as a plain string is accepted
            and treated as a one-element list (iterating a bare string would
            otherwise try to open one "file" per character).

    Returns:
        A DataFrame with the columns Timestamp, Hostname, DiskNumber, Type,
        Offset, Size and ResponseTime, holding the rows of every file
        back-to-back under a fresh 0..n-1 index.

    Raises:
        ValueError: If no path is supplied.
    """
    headers = ['Timestamp','Hostname','DiskNumber','Type','Offset','Size','ResponseTime']
    if isinstance(csv_paths, str):
        csv_paths = [csv_paths]
    frames = [pd.read_csv(csv_path, header=None, names=headers) for csv_path in csv_paths]
    if not frames:
        raise ValueError('load_data() needs at least one CSV path')
    # Every file numbers its rows from 0; without ignore_index the combined
    # frame would carry duplicate index labels.
    return pd.concat(frames, ignore_index=True)

def filetime_to_datetime(ts):
    """Convert a FILETIME-style tick count to seconds since the Unix epoch.

    ``ts`` counts 100-nanosecond ticks since 1601-01-01 (hence ``ts / 10``
    microseconds). The ticks are treated as UTC, so the result does not
    depend on the time zone of the machine running the notebook.

    Args:
        ts: Tick count (integer or float).

    Returns:
        - ``""`` when ``str(ts)`` is exactly ``"0"`` (sentinel for "no timestamp");
        - ``ts`` itself, unchanged, when the value is too large to be
          represented as a ``datetime``;
        - otherwise the number of seconds since 1970-01-01 00:00:00 as a float.
    """
    if str(ts) == "0":
        return ""
    try:
        dt = datetime(1601, 1, 1) + timedelta(microseconds=ts / 10)
    except OverflowError:
        return ts
    # Subtract the epoch explicitly instead of calling dt.timestamp(): on a
    # naive datetime, timestamp() assumes *local* time, which shifts every
    # value by the host's UTC offset.
    return (dt - datetime(1970, 1, 1)).total_seconds()

def process_data(df):
    """Reduce a raw trace frame to the four numeric columns used downstream.

    The input frame is left untouched; a new frame is returned with:

    - ``Timestamp``: each value passed through ``filetime_to_datetime``;
    - ``Type``: ``'Read'`` -> 0, ``'Write'`` -> 1 (any other value becomes NaN);
    - ``Offset``: byte offset shifted right by ``chunk_size_shift`` bits,
      i.e. expressed in chunks of ``1 << chunk_size_shift`` bytes;
    - ``Size``: byte count shifted right by 9 bits, i.e. in whole 512-byte
      units (rounded down).

    Args:
        df: Frame containing at least the columns Timestamp, Type, Offset
            and Size; Offset and Size are expected to hold integers.

    Returns:
        A new DataFrame with exactly those four columns, in that order.
    """
    # Work on an explicit copy so the column assignments below can never
    # write through to (or warn about) the caller's frame.
    out = df.loc[:, ['Timestamp', 'Type', 'Offset', 'Size']].copy()
    out['Timestamp'] = out['Timestamp'].map(filetime_to_datetime)
    out['Type'] = out['Type'].map({'Read': 0, 'Write': 1})
    # Shift the whole column at once on the underlying NumPy array rather
    # than calling a Python lambda per row.
    out['Offset'] = out['Offset'].to_numpy() >> chunk_size_shift  # bytes -> chunks
    out['Size'] = out['Size'].to_numpy() >> 9  # bytes -> 512 B sectors
    return out

def save_data(df, dst_path: str):
    """Write the trace columns of ``df`` to ``dst_path`` as a bare CSV.

    Only Timestamp, Type, Offset and Size are written, in that order, with
    neither a header row nor the index column.
    """
    columns = ['Timestamp', 'Type', 'Offset', 'Size']
    df[columns].to_csv(dst_path, header=False, index=False)
    
def plot_io_trace(df, save: bool, name: str):
    """Scatter-plot Offset against Timestamp for one trace, coloured by Type.

    Args:
        df: Frame with the columns Timestamp, Offset and Type.
        save: When true, write the figure to
            ``./datasets/<name>_<chunk size>_scatter.jpg``; otherwise show it.
        name: Trace name, used in the output file name and the plot title.

    Each call draws on its own figure and closes it afterwards. Drawing on
    pyplot's implicit current figure would make every saved image also
    contain the points of all traces plotted before it, and would keep each
    128x128-inch canvas alive in memory. The figure size is passed to the
    figure directly rather than written into the global ``rcParams``.
    """
    fig, ax = plt.subplots(figsize=(128.0, 128.0))  # figure size in inches
    try:
        ax.scatter(x=df['Timestamp'], y=df['Offset'], c=df['Type'], marker='.', cmap='coolwarm')
        ax.set(title=name, xlabel='Timestamp', ylabel='Offset')
        if save:
            # Save the image next to the processed CSVs.
            fig.savefig('./datasets/%s_%d_scatter.jpg' % (name, (1 << chunk_size_shift)))
        else:
            plt.show()
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

In [2]:
# Trace name -> number of numbered input CSVs (<name>_0.csv ... <name>_<n-1>.csv)
# that the processing loop reads for that trace. Commented-out entries are
# skipped; uncomment a line to include that trace in the next run.
datasets = {
    # 'hm': 2, 
    # 'mds': 2, 
    # 'prn': 2, 
    'proj': 5, 
    'prxy': 2, 
    # 'rsrch': 3,
    # 'src1': 3, 
    # 'src2': 3, 
    # 'stg': 2, 
    # 'ts': 1, 
    'usr': 3, 
    # 'wdev': 4, 
    # 'web': 4,
}

In [3]:
# For every selected trace: merge its numbered CSV parts, convert the columns,
# save a scatter plot, and write the processed CSV.
for trace, n_files in datasets.items():
    # Input parts are numbered <trace>_0.csv .. <trace>_<n_files-1>.csv.
    srcs = ['./datasets/MSR-Cambridge/%s_%d.csv' % (trace, idx) for idx in range(n_files)]
    # The output name carries the chunk size in bytes.
    dst = './datasets/%s_%d.csv' % (trace, (1 << chunk_size_shift))

    df = process_data(load_data(srcs))
    plot_io_trace(df, True, trace)
    save_data(df, dst)
    print(dst + ' saved')

./datasets/hm_4096.csv saved
./datasets/mds_4096.csv saved
./datasets/prn_4096.csv saved
./datasets/rsrch_4096.csv saved
./datasets/src1_4096.csv saved
./datasets/src2_4096.csv saved
./datasets/stg_4096.csv saved


In [None]:
# Preview the first rows of `df`, i.e. the last trace processed by the loop above.
df.head()

Unnamed: 0,Timestamp,Type,Offset,Size
0,1172183000.0,1,771665,8
1,1172183000.0,1,4003254,8
2,1172183000.0,1,128410,6
3,1172183000.0,1,770056,8
4,1172183000.0,1,786827,8
