In [1]:
import os

# Root directory holding the SPEC result CSVs read by this notebook.
# Set the SPEC_SPIDER_ROOT environment variable to point at another copy;
# when it is unset the original location is used.
data_root = os.environ.get('SPEC_SPIDER_ROOT', '/home/scott/Documents/SPEC_Spider')

In [2]:
!tree /home/scott/Documents/SPEC_Spider

[01;34m/home/scott/Documents/SPEC_Spider[0m
├── [01;34mcpu[0m
│   ├── [01;34mcpu2006[0m
│   │   ├── [00mSPECfp.csv[0m
│   │   ├── [00mSPECfp_rate.csv[0m
│   │   ├── [00mSPECint.csv[0m
│   │   └── [00mSPECint_rate.csv[0m
│   └── [01;34mcpu2017[0m
│       ├── [00mCFP2017_rate.csv[0m
│       ├── [00mCFP2017_speed.csv[0m
│       ├── [00mCINT2017_rate.csv[0m
│       └── [00mCINT2017_speed.csv[0m
├── [01;34mjava[0m
│   ├── [01;34mjbb2015[0m
│   │   ├── [00mSPECjbb2015-Composite.csv[0m
│   │   ├── [00mSPECjbb2015-Distributed.csv[0m
│   │   └── [00mSPECjbb2015-MultiJVM.csv[0m
│   └── [01;34mjvm2008[0m
│       └── [00mjvm2008.csv[0m
├── [00mjbb2015.csv[0m
└── [01;34mpower[0m
    └── [00mssj2008.csv[0m

7 directories, 14 files


In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
# Read the four CPU2017 result tables found under <data_root>/cpu/cpu2017.
c2017_rfp, c2017_sfp, c2017_rint, c2017_sint = (
    pd.read_csv(f"{data_root}/cpu/cpu2017/{stem}.csv")
    for stem in ('CFP2017_rate', 'CFP2017_speed', 'CINT2017_rate', 'CINT2017_speed')
)

In [5]:
# Columns kept from every CPU2017 table; all four frames are cut down to this set.
cpu2017_columns = [
    'Suite',
    'Hardware Vendor',
    'System Name',
    'Baseline',
    'Test Date',
    'HW Avail',
    'CPU Name',
    'Max MHz',
    'Nominal',
    'Enabled',
    'Orderable',
    'L1',
    'L2',
    'L3',
    'Memory',
    'Storage',
    'OS',
    'File System',
    'URL Suffix',
]
c2017_rfp, c2017_sfp, c2017_rint, c2017_sint = (
    table[cpu2017_columns]
    for table in (c2017_rfp, c2017_sfp, c2017_rint, c2017_sint)
)

In [7]:
# Stack the four tables into one frame numbered 0..n-1.
c2017 = pd.concat(
    [c2017_rfp, c2017_sfp, c2017_rint, c2017_sint],
    ignore_index=True,
)

In [8]:
# Raw SPEC column label -> label used by the rest of the notebook.
c2017_rename_dict = {
    'Hardware Vendor': 'HW Vendor',
    'Nominal': 'CPU MHz',
    'Enabled': 'CPU Enabled',
    'Orderable': 'CPU Orderable',
    'L1': 'L1 Cache',
    'L2': 'L2 Cache',
    'L3': 'L3 Cache'
}
# Rebind to the renamed copy rather than mutating in place.
c2017 = c2017.rename(columns=c2017_rename_dict)

In [9]:
# Discard every row that has a missing value in any column, then renumber the index.
c2017 = c2017.dropna().reset_index(drop=True)

In [11]:
# Display the frame's column labels to check the current schema.
c2017.columns

Index(['Suite', 'HW Vendor', 'System Name', 'Baseline', 'Test Date',
       'HW Avail', 'CPU Name', 'Max MHz', 'CPU MHz', 'CPU Enabled',
       'CPU Orderable', 'L1 Cache', 'L2 Cache', 'L3 Cache', 'Memory',
       'Storage', 'OS', 'File System', 'URL Suffix'],
      dtype='object')

## Clean Vendor and System Name

In [10]:
def clean_vendor(vendor):
    """Normalize a vendor name to a short canonical form.

    The name is stripped, then parenthesised remarks and trailing corporate
    suffixes are removed one pattern at a time (case-insensitively). Finally,
    if the remaining text matches a known vendor pattern it is replaced by
    that vendor's canonical spelling.

    Args:
        vendor: Raw vendor string.

    Returns:
        The cleaned vendor name.
    """
    vendor = vendor.strip()

    # Applied in order. The `\b` before the short suffixes stops them from
    # eating the tail of a longer word (e.g. the "co" at the end of "Cisco").
    # The misspelled variants ("Corparation", "Incoporated") are kept on
    # purpose -- presumably they occur in the scraped data.
    suffix_patterns = [
        r' *[(].*[)]',
        r',* *\bLtd\.*$|,* *\bInc\.*$',
        r',* *\bCo\.*$|,* *\bCorporation\.*$|,* *\bCorparation\.*$|,* *\bCorp\.*$'
        r'| Incoporated$| Incorporated$| Incorporation$',
        r' International$',
        r' Computer[s]*$',
        r' Technology$',
    ]
    for pattern in suffix_patterns:
        vendor = re.sub(pattern, '', vendor, flags=re.IGNORECASE)

    # (pattern, canonical name). Every pair is tested against the current
    # value, so when several match the last matching pair wins.
    replace_pairs = [
        (r'^Huawei', 'Huawei'),
        (r'^ASUS', 'ASUS'),
        (r'^acer', 'Acer'),
        (r'^Hewlett[ -]*Packard', 'HPE'),
        (r'^Inspur', 'Inspur'),
        (r'H3C', 'H3C'),
        (r'^Giga[ -]*byte', 'Gigabyte'),
        (r'^Fujitsu', 'Fujitsu'),
        (r'^Hitachi', 'Hitachi'),
        (r'^Lenovo', 'Lenovo'),
        (r'^Quanta', 'Quanta'),
        (r'^Super[ -]*Micro', 'SuperMicro'),
        (r'^UNIWIDE', 'Uniwide'),
        (r'^Wizbrain', 'Wizbrain'),
        (r'^ScaleMP', 'ScaleMP'),
        (r'^AMD', 'AMD'),
        (r'Advanced Micro Devices', 'AMD'),
        (r'^Hewelett-Packard', 'HPE'),
        (r'^Oracl', 'Oracle'),
        (r'^BEA', 'BEA'),
        (r'^OpenJDK', 'OpenJDK'),
    ]
    for pattern, canonical in replace_pairs:
        if re.search(pattern, vendor, flags=re.IGNORECASE):
            vendor = canonical

    return vendor

In [13]:
# Normalize every vendor name in place of the raw value.
c2017['HW Vendor'] = c2017['HW Vendor'].apply(clean_vendor)

In [69]:
def parse_system_name(info):
    """Reduce a 'System Name' entry to the bare system/model name.

    Steps, in order: remove a parenthesised remark and any stray
    parenthesis, keep only the text before the first comma, cut everything
    from " AMD" or " Intel" onwards, remove a trailing clock figure such as
    "2.00 GHz", and remove the "vSMP ServerONE Supermicro " prefix.

    Args:
        info: Raw system-name string.

    Returns:
        The cleaned system name.
    """
    info = re.sub(r'[(].*[)]', '', info)
    info = re.sub(r'[()]', '', info)
    info = info.strip().split(',')[0]
    # Both CPU-vendor tails are cut through to the end of the string; the
    # earlier non-greedy ' AMD.*?' matched only the literal " AMD".
    info = re.sub(r' AMD.*', '', info)
    info = re.sub(r' Intel.*', '', info)
    # Only the trailing "<number> GHz" is removed. The previous pattern
    # started at the first digit anywhere in the name and so also deleted
    # model numbers ("SR645 2.00 GHz" -> "SR").
    info = re.sub(r'\d+(?:\.\d+)?\s*GHz\s*$', '', info)
    info = re.sub(r'^vSMP ServerONE Supermicro ', '', info)
    return info.strip()
# Replace each raw system name with its cleaned form.
c2017['System Name'] = c2017['System Name'].apply(parse_system_name)

## Clean CPU

In [70]:
def get_cpu_vendor(cpu_name):
    """Return the CPU vendor named by the first word of ``cpu_name``.

    The first whitespace-separated token is returned when it is exactly
    'Intel', 'AMD' or 'Huawei'; any other token yields 'Other'.
    """
    first_token = cpu_name.split()[0]
    return first_token if first_token in ('Intel', 'AMD', 'Huawei') else 'Other'
# Derive the CPU vendor column from the CPU name.
c2017['CPU Vendor'] = c2017['CPU Name'].apply(get_cpu_vendor)

In [71]:
# Divide both clock columns by 1000 and round to two decimals.
# NOTE: the column labels still say "MHz", and running this cell twice scales twice.
c2017['Max MHz'] = c2017['Max MHz'].map(lambda mhz: round(mhz / 1000, 2))
c2017['CPU MHz'] = c2017['CPU MHz'].map(lambda mhz: round(mhz / 1000, 2))

In [72]:
def parse_cpu_enabled(info):
    """Split a 'CPU Enabled' entry into core, chip and thread counts.

    The entry is split on commas and the leading integer of each part is
    read: part 0 is the total core count, part 1 the chip count. When there
    are exactly three parts, the third gives threads per core; otherwise
    threads per core is 1.

    Args:
        info: Raw string, e.g. "64 cores, 2 chips, 2 threads/core".

    Returns:
        pd.Series with integer fields 'Total Cores', 'Chips' and
        'Threads Per Core'.
    """
    parts = info.split(',')
    total_cores = int(parts[0].split()[0])
    chips = int(parts[1].split()[0])
    # Converted to int so the column has one type; previously the parsed
    # value stayed a string while the default was the integer 1.
    threads_per_core = int(parts[2].split()[0]) if len(parts) == 3 else 1
    return pd.Series({
        'Total Cores': total_cores,
        'Chips': chips,
        'Threads Per Core': threads_per_core,
    })
# Expand each 'CPU Enabled' entry into its three count columns.
c2017[['Total Cores', 'Chips', 'Threads Per Core']] = c2017['CPU Enabled'].apply(parse_cpu_enabled)

In [73]:
# Cores per chip as a column-wise floor division (no per-row Python call needed).
c2017['Cores Per Chip'] = c2017['Total Cores'] // c2017['Chips']

In [104]:
def parse_cpu_orderable(info):
    """Return the last integer that appears before the word "chip".

    Only the text before the first ';' is examined. When that text does not
    contain "chip", 1 is returned.
    """
    head = info.split(';')[0]
    upto_chip = re.search('.*chip', head)
    if upto_chip is None:
        return 1
    counts = [int(digits) for digits in re.findall(r'\d+', upto_chip.group(0))]
    return counts[-1]
# Largest orderable chip count parsed from the 'CPU Orderable' text.
c2017['Max Chips'] = c2017['CPU Orderable'].apply(parse_cpu_orderable)

In [75]:
# Lower-case the file-system names with the vectorized string accessor.
c2017['File System'] = c2017['File System'].str.lower()

In [76]:
def parse_memory(info):
    """Parse a 'Memory' entry into total size, module count and detail text.

    The entry must start with "<number> <unit>". The number may be
    fractional, and a unit containing 'T' is scaled by 1024. The module
    count is the sum of every "<n> x" multiplier in the entry (with or
    without a space before the "x"). The detail is the text inside the
    first pair of parentheses, or '' when there is none.

    Args:
        info: Raw string, e.g. "768 GB (24 x 32 GB 2Rx4 PC4-2933Y-R)".

    Returns:
        pd.Series with 'Total Memory Amount', 'Memory Number' and
        'Memory Detail'.

    Raises:
        ValueError: if the entry does not start with a number and a unit.
    """
    head = re.match(r'\s*(\d+(?:\.\d+)?)\s*([A-Za-z]+)', info)
    if head is None:
        raise ValueError(f"cannot parse memory size from {info!r}")
    total = float(head.group(1))
    if 'T' in head.group(2):
        total *= 1024
    # Whole amounts stay integers, as before; only fractional results are floats.
    total_memory_amount = int(total) if total.is_integer() else total

    # `\b` keeps tokens such as "LPDDR4x" from being read as a multiplier.
    memory_num = sum(int(count) for count in re.findall(r'\b(\d+)\s*x', info))

    detail = re.search(r'[(](.*?)[)]', info)
    memory_detail = detail.group(1) if detail else ''

    return pd.Series({
        'Total Memory Amount': total_memory_amount,
        'Memory Number': memory_num,
        'Memory Detail': memory_detail,
    })
# parse_memory returns three fields, so three target columns are required;
# listing only two fails with a column-count mismatch.
c2017[['Total Memory Amount', 'Memory Number', 'Memory Detail']] = c2017['Memory'].apply(parse_memory)

In [77]:
# Size of one module: total amount floor-divided by the module count, row by row.
c2017['Memory Amount'] = c2017.apply(
    lambda row: row['Total Memory Amount'] // row['Memory Number'],
    axis=1,
)

In [78]:
def parse_storage(info):
    """Parse a 'Storage' entry into device count, per-device size and type.

    * Count: the integer of the first "<n> x" multiplier, else 1.
    * Size: the first "<number> T/TB/TiB" figure scaled by 1024, otherwise
      the first "<number> G/GB/GiB" figure, otherwise NaN. The unit must
      not be followed by another letter or '/', so "6 Gb/s" or "2 Toshiba"
      is not taken as a capacity.
    * Type: the first of SSD, HDD, ramfs, tmpfs, zfs found in the text
      (case-insensitive); 'SSD' when none is found.

    Args:
        info: Raw storage description.

    Returns:
        pd.Series with 'Storage Number', 'Storage Size' and 'Storage Type'.
    """
    multiplier = re.search(r'(\d+)[ ]*x', info)
    storage_num = int(multiplier.group(1)) if multiplier else 1

    number = r'(\d+(?:\.\d+)?)[ ]*'
    unit_tail = r'(?:i?B(?:ytes?)?)?(?![A-Za-z/])'
    tera = re.search(number + 'T' + unit_tail, info)
    giga = re.search(number + 'G' + unit_tail, info)
    if tera:
        # A terabyte figure takes precedence and is scaled by 1024.
        storage_size = float(tera.group(1)) * 1024
    elif giga:
        storage_size = float(giga.group(1))
    else:
        # No recognizable capacity: report NaN instead of raising IndexError.
        storage_size = float('nan')

    lowered = info.lower()
    if 'ssd' in lowered:
        storage_type = 'SSD'
    elif 'hdd' in lowered:
        storage_type = 'HDD'
    elif 'ramfs' in lowered:
        storage_type = 'ramfs'
    elif 'tmpfs' in lowered:
        storage_type = 'tmpfs'
    elif 'zfs' in lowered:
        storage_type = 'zfs'
    else:
        # Unrecognized descriptions are labelled SSD, as before.
        storage_type = 'SSD'

    return pd.Series({
        'Storage Number': storage_num,
        'Storage Size': storage_size,
        'Storage Type': storage_type
    })

# Expand each 'Storage' entry into count, per-device size and type columns.
c2017[['Storage Number', 'Storage Size', 'Storage Type']] = c2017['Storage'].apply(parse_storage)

In [79]:
# Total capacity = device count times per-device size.
c2017['Total Storage Size'] = c2017['Storage Number'].mul(c2017['Storage Size'])

## Clean Date

In [86]:
# Three-letter month abbreviation -> month number (1-12).
month_mapper = {
    abbr: number
    for number, abbr in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start=1,
    )
}
# Reverse lookup: month number -> abbreviation.
re_month_mapper = dict(zip(month_mapper.values(), month_mapper.keys()))

re_month_mapper

{1: 'Jan',
 2: 'Feb',
 3: 'Mar',
 4: 'Apr',
 5: 'May',
 6: 'Jun',
 7: 'Jul',
 8: 'Aug',
 9: 'Sep',
 10: 'Oct',
 11: 'Nov',
 12: 'Dec'}

In [102]:
def split_date_1(item):
    """Parse a month-resolution date string into day, month and year.

    Recognized shapes, tested in this order (the day is always 1):
        May-2017   month name, '-', year
        2014.3     year, '.', month number
        May 2017   month name, whitespace, year
        2010       year only; the month defaults to January

    Only the first three letters of a month name are used for the lookup.

    Returns:
        pd.Series with integer fields 'day', 'month' and 'year'.
    """
    text = item.strip()
    if '-' in text:
        month_name, year_text = text.split('-')
    elif '.' in text:
        year_text, month_number = text.split('.')
        month_name = re_month_mapper[int(month_number)]
    elif ' ' in text:
        month_name, year_text = text.split()
    else:
        month_name, year_text = 'Jan', text
    return pd.Series({
        'day': 1,
        'month': month_mapper[month_name[:3]],
        'year': int(year_text),
    })

In [81]:
def split_date_2(item):
    """Parse a day-resolution date string into day, month and year.

    Recognized shapes, tested in this order:
        May 1, 2018    month name and day, ',', year
        2009/05/19     year/month/day
        12.02.2009     day.month.year

    Only the first three letters of a month name are used, and the
    misspelling 'Spe' is read as 'Sep'.

    Args:
        item: Date string; surrounding whitespace is ignored.

    Returns:
        pd.Series with integer fields 'day', 'month' and 'year'.

    Raises:
        ValueError: if the string contains none of ',', '/' or '.'.
    """
    item = item.strip()
    if ',' in item:
        month_day, year = item.split(',')
        month, day = month_day.split()
    elif '/' in item:
        year, month, day = item.split('/')
        month = re_month_mapper[int(month)]
    elif '.' in item:
        day, month, year = item.split('.')
        month = re_month_mapper[int(month)]
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        raise ValueError(f"unsupported date format: {item!r}")
    month = month[:3]
    if month == 'Spe':
        month = 'Sep'
    month_number = month_mapper[month]
    return pd.Series({'day': int(day), 'month': int(month_number), 'year': int(year)})

In [82]:
def get_quarter(url):
    """Read the quarter and year out of the first path segment of ``url``.

    The segment before the first '/' is used: its last character is the
    quarter and characters 3-6 (zero-based) are the year.

    Returns:
        pd.Series with integer fields 'quarter' and 'submit_year'.
    """
    folder = url.partition('/')[0]
    quarter = int(folder[-1])
    submit_year = int(folder[3:7])
    return pd.Series({'quarter': quarter, 'submit_year': submit_year})

In [84]:
# Display the frame's column labels to check the current schema.
c2017.columns

Index(['Suite', 'HW Vendor', 'System Name', 'Baseline', 'Test Date',
       'HW Avail', 'CPU Name', 'Max MHz', 'CPU MHz', 'CPU Enabled',
       'CPU Orderable', 'L1 Cache', 'L2 Cache', 'L3 Cache', 'Memory',
       'Storage', 'OS', 'File System', 'URL Suffix', 'CPU Vendor',
       'Total Cores', 'Chips', 'Threads Per Core', 'Cores Per Chip',
       'Max Chips', 'Total Memory Amount', 'Memory Number', 'Memory Detail',
       'Memory Amount', 'Storage Number', 'Storage Size', 'Storage Type',
       'Total Storage Size'],
      dtype='object')

In [89]:
# Split each 'Test Date' into day / month / year columns.
c2017[['day', 'month', 'year']] = c2017['Test Date'].apply(split_date_1)

In [91]:
# Extract the quarter and year from each 'URL Suffix'.
c2017[['quarter', 'submit_year']] = c2017['URL Suffix'].apply(get_quarter)

In [93]:
# Display the frame's column labels to check the current schema.
c2017.columns

Index(['Suite', 'HW Vendor', 'System Name', 'Baseline', 'Test Date',
       'HW Avail', 'CPU Name', 'Max MHz', 'CPU MHz', 'CPU Enabled',
       'CPU Orderable', 'L1 Cache', 'L2 Cache', 'L3 Cache', 'Memory',
       'Storage', 'OS', 'File System', 'URL Suffix', 'CPU Vendor',
       'Total Cores', 'Chips', 'Threads Per Core', 'Cores Per Chip',
       'Max Chips', 'Total Memory Amount', 'Memory Number', 'Memory Detail',
       'Memory Amount', 'Storage Number', 'Storage Size', 'Storage Type',
       'Total Storage Size', 'day', 'month', 'year', 'quarter', 'submit_year'],
      dtype='object')

In [99]:
# List the distinct 'HW Avail' strings to see which date formats occur.
c2017['HW Avail'].unique()

array(['Apr-2021', 'Apr-2019', 'Feb-2020', 'Sep-2020', 'Oct-2020',
       'Nov-2020', 'Dec-2020', 'Dec-2017', 'Jul-2021', 'Mar-2022',
       'Mar-2021', 'Sep-2021', 'Aug-2019', 'May-2021', 'Sep-2019',
       'Apr-2020', 'Jul-2020', 'May-2019', 'Nov-2018', 'Oct-2019',
       'Jul-2017', 'Oct-2018', 'Sep-2018', 'Dec-2018', 'Jul-2018',
       'Apr-2018', 'Mar-2017', 'Oct-2017', 'Jun-2017', 'Sep-2017',
       'Oct-2016', 'Nov-2019', 'Jun-2019', 'Jan-2020', 'Oct-2015',
       'Feb-2007', 'Jun-2021', 'Mar-2020', 'Mar-2019', 'Aug-2020',
       'May-2020', 'Dec-2019', 'Jul-2019', 'Nov-2017', 'Apr-2017',
       'Jan-2019', 'Aug-2017', 'Jan-2018', 'Jun-2018', 'May-2022',
       'Apr-2022', 'Nov-2021', 'Jun-2020', 'Mar-2018', 'May-2018',
       'Apr-2016', 'Jan-2016', 'Jan-2022', 'Aug-2018', 'Feb-2018',
       ' Jan-2022', 'Feb-2019', 'Jun-2016', 'Oct-2021', 'Aug-2021',
       'May-2017', 'Dec-2016', 'Dec-2021', ' Jun-2021', 'Aug-2016'],
      dtype=object)

In [103]:
# Parse every 'HW Avail' string with split_date_1 and display the result.
# (The stray trailing backtick that made this cell a syntax error is removed.)
c2017['HW Avail'].apply(split_date_1)

Unnamed: 0,day,month,year
0,1,4,2021
1,1,4,2021
2,1,4,2021
3,1,4,2021
4,1,4,2021
...,...,...,...
27569,1,2,2018
27570,1,9,2017
27571,1,9,2017
27572,1,7,2017
