In [None]:
import pandas as pd
from datetime import datetime,timedelta
import matplotlib.pyplot as plt

def load_data(csv_paths):
    """Read one or more header-less trace CSV files into a single DataFrame.

    Parameters
    ----------
    csv_paths : str, os.PathLike, or iterable of sources
        Either a single path or an iterable of sources accepted by
        ``pandas.read_csv``. A lone path is treated as a one-element list
        instead of being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        The rows of every source, concatenated in the order given, with the
        columns Timestamp, Hostname, DiskNumber, Type, Offset, Size and
        ResponseTime. Each part keeps its own row index, so index labels
        repeat across parts.

    Raises
    ------
    ValueError
        If ``csv_paths`` contains no sources.
    """
    headers = ['Timestamp','Hostname','DiskNumber','Type','Offset','Size','ResponseTime']

    # A bare string is itself iterable; wrap it so it is read as one file.
    if isinstance(csv_paths, str) or hasattr(csv_paths, '__fspath__'):
        csv_paths = [csv_paths]

    frames = [pd.read_csv(csv_path, header=None, names=headers) for csv_path in csv_paths]
    if not frames:
        # pd.concat([]) raises the same exception type with a less helpful message.
        raise ValueError('load_data() needs at least one CSV path')
    return pd.concat(frames)

def filetime_to_datetime(ts):
    """Convert a Windows FILETIME value to Unix epoch seconds (UTC).

    A FILETIME counts 100-nanosecond ticks since 1601-01-01. Despite the
    function's name, the result is a float number of seconds since
    1970-01-01, not a ``datetime`` object.

    Parameters
    ----------
    ts : int or float (or the string "0")
        FILETIME tick count.

    Returns
    -------
    float
        Seconds since the Unix epoch, computed independently of the local
        timezone.
    str
        The empty string when ``ts`` is zero (``str(ts) == "0"``).
    same type as ``ts``
        ``ts`` itself, unchanged, when the value is too large to be
        represented as a ``datetime``.
    """
    if str(ts) == "0":
        return ""
    try:
        # ts / 10 turns 100 ns ticks into microseconds.
        dt = datetime(1601, 1, 1) + timedelta(microseconds=ts / 10)
    except OverflowError:
        return ts
    # Both datetimes are naive and on the same (UTC) clock, so the difference
    # is timezone-independent. Calling dt.timestamp() on a naive datetime
    # would interpret it as local time instead.
    return (dt - datetime(1970, 1, 1)).total_seconds()

def process_data(df):
    """Reduce a raw trace frame to the four columns used downstream.

    Keeps Timestamp, Type, Offset and Size (in that order) and rewrites three
    of them:

    * Timestamp - each value is passed through ``filetime_to_datetime``.
    * Type      - 'Read' becomes 0 and 'Write' becomes 1; any other value
                  is not in the mapping and therefore becomes NaN.
    * Offset    - each value is shifted right by 14 bits.

    Size is carried over unchanged. A new frame is returned.
    """
    wanted = ['Timestamp', 'Type', 'Offset', 'Size']
    type_codes = {'Read': 0, 'Write': 1}

    subset = df.loc[:, wanted]
    return subset.assign(
        Timestamp=subset['Timestamp'].map(filetime_to_datetime),
        Type=subset['Type'].map(type_codes),
        Offset=subset['Offset'].map(lambda off: off >> 14),
    )

def save_data(df, dst_path: str):
    """Write the Type, Offset and Size columns of ``df`` to ``dst_path``.

    The output is CSV with neither a header row nor the row index; any other
    columns of ``df`` are left out.
    """
    df.to_csv(
        dst_path,
        columns=['Type', 'Offset', 'Size'],
        index=False,
        header=False,
    )
    
def plot_io_trace(df, save: bool, name: str):
    """Scatter-plot an I/O trace: Timestamp on x, Offset on y, coloured by Type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Timestamp', 'Offset' and 'Type'.
    save : bool
        If true, write the figure to ``./datasets/<name>_scatter.jpg``;
        otherwise display it with ``plt.show()``.
    name : str
        Label used to build the output file name when ``save`` is true.

    Each call draws on its own figure and closes it afterwards, so calling
    this repeatedly does not overlay one trace on the previous one or keep
    the large canvases alive.
    """
    # Size is set on this figure only, leaving the global rcParams untouched.
    fig, ax = plt.subplots(figsize=(128.0, 128.0))
    try:
        ax.scatter(x=df['Timestamp'], y=df['Offset'], c=df['Type'], marker='.', cmap='coolwarm')
        if save:
            fig.savefig('./datasets/%s_scatter.jpg' % name)  # save the image
        else:
            plt.show()
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close(fig)

In [2]:
# Trace family name -> number of CSV parts (<name>_0.csv .. <name>_{n-1}.csv).
datasets = {
    'hm': 2,
    'mds': 2,
    'prn': 2,
    'proj': 5,
    'prxy': 2,
    'rsrch': 3,
    'src1': 3,
    'src2': 3,
    'stg': 2,
    'ts': 1,
    'usr': 3,
    'wdev': 4,
    'web': 4,
}

# One [source_paths, destination_path] entry per trace family.
tasks = []
for k, v in datasets.items():
    srcs = ['./datasets/MSR-Cambridge/%s_%d.csv' % (k, i) for i in range(v)]
    tasks.append([srcs, './datasets/%s_processed.csv' % k])

In [None]:
# For every trace family: load its parts, preprocess them, save a scatter
# plot, then write the processed CSV.
for k, v in datasets.items():
    srcs = ['./datasets/MSR-Cambridge/%s_%d.csv' % (k, i) for i in range(v)]
    dst = './datasets/%s_processed.csv' % k

    df = process_data(load_data(srcs))
    plot_io_trace(df, True, k)
    save_data(df, dst)
    print(dst + ' saved')