In [1]:
import os

# Root directory of the SPEC result CSVs. Set the SPEC_SPIDER_ROOT environment
# variable to run the notebook on another machine; the original author's
# location is kept as the default so existing setups behave the same.
data_root = os.environ.get('SPEC_SPIDER_ROOT', '/home/scott/Documents/SPEC_Spider')

In [2]:
!tree /home/scott/Documents/SPEC_Spider

[01;34m/home/scott/Documents/SPEC_Spider[0m
├── [01;34mcpu[0m
│   ├── [01;34mcpu2006[0m
│   │   ├── [00mSPECfp.csv[0m
│   │   ├── [00mSPECfp_rate.csv[0m
│   │   ├── [00mSPECint.csv[0m
│   │   └── [00mSPECint_rate.csv[0m
│   └── [01;34mcpu2017[0m
│       ├── [00mCFP2017_rate.csv[0m
│       ├── [00mCFP2017_speed.csv[0m
│       ├── [00mCINT2017_rate.csv[0m
│       └── [00mCINT2017_speed.csv[0m
├── [01;34mjava[0m
│   ├── [01;34mjbb2015[0m
│   │   ├── [00mSPECjbb2015-Composite.csv[0m
│   │   ├── [00mSPECjbb2015-Distributed.csv[0m
│   │   └── [00mSPECjbb2015-MultiJVM.csv[0m
│   └── [01;34mjvm2008[0m
│       └── [00mjvm2008.csv[0m
├── [00mjbb2015.csv[0m
└── [01;34mpower[0m
    └── [00mssj2008.csv[0m

7 directories, 14 files


In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
# Load the three SPECjbb2015 result categories, one DataFrame per CSV file.
j2015_com, j2015_dis, j2015_mul = (
    pd.read_csv(f"{data_root}/java/jbb2015/SPECjbb2015-{category}.csv")
    for category in ('Composite', 'Distributed', 'MultiJVM')
)

In [5]:
# Columns kept from every SPECjbb2015 CSV, in output order. The labels are
# used verbatim to index the loaded frames, so their spelling (including
# 'cirtical_jOPS') must not be "corrected" here.
jbb2015_columns = [
    # result identification and scores
    'Suite', 'Vendor', 'System Name',
    'max_jOPS', 'cirtical_jOPS',
    'Test date', 'Hardware Availability',
    # totals across the whole configuration
    'Total Systems', 'Total Nodes', 'Nodes Per System',
    'Total Chips', 'Total Cores', 'Total Threads',
    'Total Memory Amount (GB)',
    # processor description
    'CPU Name', 'CPU Characteristics',
    'Number of Systems', 'Chips Per System', 'Cores Per System',
    'Cores Per Chip', 'Threads Per System', 'Threads Per Core',
    'CPU Frequency (MHz)',
    'Primary Cache', 'Secondary Cache', 'Tertiary Cache',
    # storage and memory
    'Disk', 'File System',
    'Memory Amount (GB)', '# and size of DIMM(s)', 'Memory Details',
    # software stack
    'OS Name', 'OS Vendor', 'OS Version',
    'JVM Name', 'JVM Vendor', 'JVM Version',
    # link back to the published result
    'URL Suffix',
]

In [6]:
# Restrict each category frame to the shared column set (same order in all three).
j2015_com, j2015_mul, j2015_dis = (
    frame[jbb2015_columns]
    for frame in (j2015_com, j2015_mul, j2015_dis)
)

In [7]:
# Stack Composite, MultiJVM and Distributed results into one frame with a
# fresh 0..n-1 index.
j2015 = pd.concat(
    [j2015_com, j2015_mul, j2015_dis],
    ignore_index=True,
)

In [9]:
# Original SPECjbb2015 header -> column name used in the cells that follow.
j2015_rename_dict = {
    'Vendor': 'HW Vendor',
    'Test date': 'Test Date',
    'Hardware Availability': 'HW Avail',
    'CPU Frequency (MHz)': 'CPU MHz',
    'Primary Cache': 'L1',
    'Secondary Cache': 'L2',
    'Tertiary Cache': 'L3',
    'Disk': 'Storage',
}
# Rebind rather than mutate in place; columns not listed keep their names.
j2015 = j2015.rename(columns=j2015_rename_dict)

## Clean Vendor

In [11]:
def clean_vendor(vendor):
    """Normalize a raw vendor string to a short company name.

    Processing order:
      1. Trim surrounding whitespace.
      2. Remove a parenthesised remark and trailing legal/generic suffixes
         (Ltd, Inc, Co, Corp(oration), International, Computer(s),
         Technology), one pattern after another, case-insensitively.
      3. If the result matches one of the known vendor patterns, replace the
         whole string with that vendor's canonical spelling.

    Parameters
    ----------
    vendor : str
        Raw vendor name. Non-string values (e.g. a missing value) are
        returned unchanged instead of raising.

    Returns
    -------
    str
        The cleaned vendor name.
    """
    if not isinstance(vendor, str):
        return vendor

    vendor = vendor.strip()

    # Raw strings: '\.' in a normal literal is an invalid escape sequence.
    # Order matters: each pattern is anchored at the end of what the previous
    # ones left behind (e.g. ", Ltd." must go before " Co." can match).
    suffix_patterns = [
        r' *[(].*[)]',
        r',* *Ltd\.*$|,* *Inc\.*$',
        r',* *Co\.*$|,* *Corporation\.*$|,* *Corparation\.*$|,* *Corp\.*$| Incoporated$| Incorporated$| Incorporation$',
        r' International$',
        r' Computer[s]*$',
        r' Technology$',
    ]
    for pattern in suffix_patterns:
        vendor = re.sub(pattern, '', vendor, flags=re.IGNORECASE)

    # (pattern, canonical name). Includes known misspellings seen in the data.
    canonical_names = [
        (r'^Huawei', 'Huawei'),
        (r'^ASUS', 'ASUS'),
        (r'^acer', 'Acer'),
        (r'^Hewlett[ -]*Packard', 'HPE'),
        (r'^Inspur', 'Inspur'),
        (r'H3C', 'H3C'),
        (r'^Giga[ -]*byte', 'Gigabyte'),
        (r'^Fujitsu', 'Fujitsu'),
        (r'^Hitachi', 'Hitachi'),
        (r'^Lenovo', 'Lenovo'),
        (r'^Quanta', 'Quanta'),
        (r'^Super[ -]*Micro', 'SuperMicro'),
        (r'^UNIWIDE', 'Uniwide'),
        (r'^Wizbrain', 'Wizbrain'),
        (r'^ScaleMP', 'ScaleMP'),
        (r'^AMD', 'AMD'),
        (r'Advanced Micro Devices', 'AMD'),
        (r'^Hewelett-Packard', 'HPE'),
        (r'^Oracl', 'Oracle'),
        (r'^BEA', 'BEA'),
        (r'^OpenJDK', 'OpenJDK'),
    ]
    # No early exit: every pattern is tried against the current value, so the
    # last matching entry wins.
    for pattern, canonical in canonical_names:
        if re.search(pattern, vendor, flags=re.IGNORECASE):
            vendor = canonical

    return vendor

In [13]:
# Normalize every hardware vendor name element-wise.
j2015['HW Vendor'] = j2015['HW Vendor'].apply(clean_vendor)

In [15]:
def parse_system_name(info):
    """Reduce a free-form system description to the bare system/model name.

    Steps, in order: drop parenthesised text (and any stray parentheses),
    keep only the part before the first comma, remove CPU vendor text, remove
    a trailing clock speed such as "2.1GHz", and remove the
    "vSMP ServerONE Supermicro " prefix.

    Parameters
    ----------
    info : str
        Raw "System Name" value.

    Returns
    -------
    str
        The shortened system name, stripped of surrounding whitespace.
    """
    # Greedy on purpose: removes everything from the first "(" to the last ")".
    info = re.sub(r'[(].*[)]', '', info)
    # Unbalanced leftovers.
    info = re.sub(r'[(]|[)]', '', info)
    info = info.strip()
    info = info.split(',')[0]
    # NOTE(review): the lazy ".*?" matches nothing, so only the literal " AMD"
    # is removed and the CPU model text after it is kept (unlike " Intel.*"
    # below, which drops the rest of the string). Behaviour left as is --
    # confirm whether " AMD.*" was intended.
    info = re.sub(r' AMD.*?', '', info)
    info = re.sub(r' Intel.*', '', info)
    # Trailing clock speed only. The dot is escaped (same number pattern as
    # parse_cpu_char); an unescaped ".*" here would start matching at the
    # first digit of the model number and delete it, e.g. "R740 2.1GHz" -> "R".
    info = re.sub(r'\d+\.*\d*GHz$', '', info)
    info = re.sub(r'^vSMP ServerONE Supermicro ', '', info)
    return info.strip()

In [19]:
# Shorten every system description to its bare system name.
j2015['System Name'] = j2015['System Name'].apply(parse_system_name)

In [18]:
# Shape of the array of distinct 'System Name' values, i.e. how many different names there are.
j2015['System Name'].unique().shape

(157,)

## Clean CPU

In [21]:
# Display the current column labels of j2015.
j2015.columns

Index(['Suite', 'HW Vendor', 'System Name', 'max_jOPS', 'cirtical_jOPS',
       'Test Date', 'HW Avail', 'Total Systems', 'Total Nodes',
       'Nodes Per System', 'Total Chips', 'Total Cores', 'Total Threads',
       'Total Memory Amount (GB)', 'CPU Name', 'CPU Characteristics',
       'Number of Systems', 'Chips Per System', 'Cores Per System',
       'Cores Per Chip', 'Threads Per System', 'Threads Per Core', 'CPU MHz',
       'L1', 'L2', 'L3', 'Storage', 'File System', 'Memory Amount (GB)',
       '# and size of DIMM(s)', 'Memory Details', 'OS Name', 'OS Vendor',
       'OS Version', 'JVM Name', 'JVM Vendor', 'JVM Version', 'URL Suffix'],
      dtype='object')

In [23]:
def get_cpu_vendor(cpu_name):
    """Return the CPU vendor taken from the first word of a CPU name.

    Parameters
    ----------
    cpu_name : str
        CPU description, e.g. "Intel Xeon Platinum 8280".

    Returns
    -------
    str
        'Intel', 'AMD' or 'Huawei' when the first whitespace-separated word
        is exactly one of those (case-sensitive); 'Other' for anything else,
        including empty/blank strings and non-string values.
    """
    # Guard: "".split() is empty, so indexing [0] directly would raise.
    words = cpu_name.split() if isinstance(cpu_name, str) else []
    if words and words[0] in ('Intel', 'AMD', 'Huawei'):
        return words[0]
    return 'Other'
# New column: CPU vendor derived from the first word of 'CPU Name'.
j2015['CPU Vendor'] = j2015['CPU Name'].apply(get_cpu_vendor)

In [26]:
# Divide the frequency by 1000 and keep two decimals. The column keeps its
# 'CPU MHz' label although the stored number is now 1000x smaller.
# NOTE: not idempotent -- re-running this cell divides by 1000 again.
j2015['CPU MHz'] = j2015['CPU MHz'].apply(
    lambda mhz: round(mhz / 1000, 2)
)

In [28]:
def parse_cpu_char(info):
    """Extract the first clock speed mentioned in a CPU characteristics string.

    A value written in GHz (e.g. "3.70 GHz", "3.5GHz") is preferred. If none
    is present, the first value written in MHz is used and converted to GHz,
    so the result is always on the same scale (the notebook's 'CPU MHz'
    column, used as the fallback for 0, is likewise divided by 1000).

    Parameters
    ----------
    info : str
        Free-form "CPU Characteristics" text.

    Returns
    -------
    float or int
        The clock speed in GHz, or 0 when no frequency is found or ``info``
        is not a string (e.g. a missing value).
    """
    if not isinstance(info, str):
        return 0

    # Unit matching is case-sensitive and allows one optional space on either
    # side of the G/M ("3.7GHz", "3.7 GHz", "3.7 G Hz").
    ghz_match = re.search(r'(\d+\.*\d*)[ ]?G[ ]?Hz', info)
    if ghz_match:
        return float(ghz_match.group(1))

    mhz_match = re.search(r'(\d+\.*\d*)[ ]?M[ ]?Hz', info)
    if mhz_match:
        # Convert so MHz-only descriptions do not end up 1000x larger than
        # GHz ones in the same column.
        return round(float(mhz_match.group(1)) / 1000, 2)

    return 0

In [31]:
# New column: clock speed parsed from the free-text CPU characteristics (0 when none is found).
j2015['Max MHz'] = j2015['CPU Characteristics'].apply(parse_cpu_char)

In [33]:
# Rows where no clock speed could be parsed (value 0) fall back to the
# row's own 'CPU MHz' value.
f = j2015['Max MHz'].eq(0.)
indices = j2015.index[f]
cpu_mhz = j2015.loc[indices, 'CPU MHz']
j2015.loc[indices, 'Max MHz'] = cpu_mhz

In [36]:
# Lower-case the file-system names with the vectorized string accessor.
# Unlike a per-element x.lower(), missing values stay missing instead of
# raising AttributeError.
j2015['File System'] = j2015['File System'].str.lower()

In [48]:
def parse_memory_amount(item):
    """Convert a memory amount such as "64 GB" to an integer.

    Parameters
    ----------
    item : int or str
        Either an integer (Python or NumPy) or a string holding an integer,
        optionally containing the unit "GB" and surrounding whitespace.

    Returns
    -------
    int
        The amount as a plain Python int.

    Raises
    ------
    ValueError
        If the string left after removing "GB" is not an integer literal.
    """
    import numbers

    # isinstance against numbers.Integral also accepts NumPy integer scalars,
    # which an exact type(item) != int comparison would send down the string
    # path and crash on the 'GB' containment test.
    if isinstance(item, numbers.Integral):
        return int(item)

    digits = item.replace('GB', '').strip()
    return int(digits)
# Convert per-system memory amounts to integers.
j2015['Memory Amount (GB)'] = j2015['Memory Amount (GB)'].apply(parse_memory_amount)

In [51]:
# Same conversion for the total memory amount column.
j2015['Total Memory Amount (GB)'] = j2015['Total Memory Amount (GB)'].apply(parse_memory_amount)

In [58]:
def get_memory_num(info):
    """Return the module count from a DIMM description such as "16 x 64 GB".

    Parameters
    ----------
    info : str
        Text of the "# and size of DIMM(s)" field.

    Returns
    -------
    int
        The digits that precede the first lowercase "x" (optional spaces in
        between), or 1 when the text has no "<number> x" part.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    count_match = re.search(r'(\d+)[ ]*x', info)
    if count_match is None:
        return 1
    return int(count_match.group(1))

In [60]:
# New column: number of memory modules parsed from the DIMM description.
j2015['Memory Number'] = j2015['# and size of DIMM(s)'].apply(get_memory_num)

In [61]:
# Show the raw DIMM text next to the memory amount columns and the parsed 'Memory Number'.
j2015[['# and size of DIMM(s)', 'Memory Details', 'Memory Amount (GB)', 'Total Memory Amount (GB)', 'Memory Number']]

Unnamed: 0,# and size of DIMM(s),Memory Details,Memory Amount (GB),Total Memory Amount (GB),Memory Number
0,16 x 64 GB,64 GB 2Rx4 PC4-3200AA-R,1024,1024,16
1,4 x 16GB,16GB 2Rx8 PC4-2666V-E,64,64,4
2,4 x 16GB,16GB 2Rx8 PC4-2666V-E,64,64,4
3,4 x 16GB,16GB 2Rx8 PC4-2666V-E,64,64,4
4,24 x 32 GB,32GB 2Rx4 PC4-2666V-R,768,768,24
...,...,...,...,...,...
662,16 x 32768 MB,32 GB 2Rx4 PC4-2400T-R,512,512,16
663,48 x 16 GB,768 GB (48 x 16 GB 2Rx8 PC4-2666V),768,768,48
664,24 x 64 GB,1536 GB (24 x 64 GB 4Rx4 PC4-2666V-L),1536,1536,24
665,16 x 64 GB,64 GB 4Rx4 PC4-3200AA-L,1024,1024,16


In [63]:
def parse_storage(info):
    """Classify a storage description into a coarse storage type.

    The text is checked, case-insensitively and in this order, for "SSD",
    "HDD", "ramfs", "tmpfs" and "zfs"; the first keyword found decides the
    result. Descriptions containing none of them are labelled 'SSD'.

    Parameters
    ----------
    info : str
        Free-form storage description.

    Returns
    -------
    str
        One of 'SSD', 'HDD', 'ramfs', 'tmpfs', 'zfs'.
    """
    upper_text = info.upper()
    lower_text = info.lower()
    # (text to search, keyword, resulting label); order is the priority.
    rules = (
        (upper_text, 'SSD', 'SSD'),
        (upper_text, 'HDD', 'HDD'),
        (lower_text, 'ramfs', 'ramfs'),
        (lower_text, 'tmpfs', 'tmpfs'),
        (lower_text, 'zfs', 'zfs'),
    )
    for haystack, keyword, label in rules:
        if keyword in haystack:
            return label
    # Default when no keyword is present.
    return 'SSD'

In [65]:
# New column: coarse storage type derived from the storage description.
j2015['Storage Type'] = j2015['Storage'].apply(parse_storage)

In [67]:
# Display the column labels again to see the derived columns alongside the original ones.
j2015.columns

Index(['Suite', 'HW Vendor', 'System Name', 'max_jOPS', 'cirtical_jOPS',
       'Test Date', 'HW Avail', 'Total Systems', 'Total Nodes',
       'Nodes Per System', 'Total Chips', 'Total Cores', 'Total Threads',
       'Total Memory Amount (GB)', 'CPU Name', 'CPU Characteristics',
       'Number of Systems', 'Chips Per System', 'Cores Per System',
       'Cores Per Chip', 'Threads Per System', 'Threads Per Core', 'CPU MHz',
       'L1', 'L2', 'L3', 'Storage', 'File System', 'Memory Amount (GB)',
       '# and size of DIMM(s)', 'Memory Details', 'OS Name', 'OS Vendor',
       'OS Version', 'JVM Name', 'JVM Vendor', 'JVM Version', 'URL Suffix',
       'CPU Vendor', 'Max MHz', 'Memory Number', 'Storage Type'],
      dtype='object')

In [68]:
# Inspect the OS name and OS version columns side by side.
j2015[['OS Name', 'OS Version']]

Unnamed: 0,OS Name,OS Version
0,SUSE Linux Enterprise Server 15 SP2,5.3.18-22-default
1,SUSE Linux Enterprise Server 15,4.12.14-23-default
2,SUSE Linux Enterprise Server 15,4.12.14-23-default
3,SUSE Linux Enterprise Server 15,4.12.14-23-default
4,Red Hat Enterprise Linux Server 7.4,3.10.0-693.11.6.el7.x86_64
...,...,...
662,Red Hat Enterprise Linux Server 7.2,3.10.0-327
663,SUSE Linux Enterprise Server 12 SP2,4.4.21-69-default
664,Red Hat Enterprise Linux,Red Hat Enterprise Linux Server release 7.3 (M...
665,SUSE Linux Enterprise Server 15 SP2,5.3.18.22-default
