In [1]:
data_root = '/home/scott/Documents/SPEC_Spider'

In [2]:
!tree /home/scott/Documents/SPEC_Spider

[01;34m/home/scott/Documents/SPEC_Spider[0m
├── [01;34mcpu[0m
│   ├── [01;34mcpu2006[0m
│   │   ├── [00mSPECfp.csv[0m
│   │   ├── [00mSPECfp_rate.csv[0m
│   │   ├── [00mSPECint.csv[0m
│   │   └── [00mSPECint_rate.csv[0m
│   └── [01;34mcpu2017[0m
│       ├── [00mCFP2017_rate.csv[0m
│       ├── [00mCFP2017_speed.csv[0m
│       ├── [00mCINT2017_rate.csv[0m
│       └── [00mCINT2017_speed.csv[0m
├── [01;34mjava[0m
│   ├── [01;34mjbb2015[0m
│   │   ├── [00mSPECjbb2015-Composite.csv[0m
│   │   ├── [00mSPECjbb2015-Distributed.csv[0m
│   │   └── [00mSPECjbb2015-MultiJVM.csv[0m
│   └── [01;34mjvm2008[0m
│       └── [00mjvm2008.csv[0m
├── [00mjbb2015.csv[0m
└── [01;34mpower[0m
    └── [00mssj2008.csv[0m

7 directories, 14 files


In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
jvm2008 = pd.read_csv(f"{data_root}/java/jvm2008/jvm2008.csv")

In [5]:
jvm2008.columns

Index(['Suite', 'Result', 'Submitter', 'SPEC license', 'Test date:',
       'HW vendor', 'HW model', 'HW available', 'CPU vendor', 'CPU name',
       'CPU frequency', '# of logical cpus', '# of chips', '# of cores',
       'Cores per chip', 'Threads per core', 'Threading enabled',
       'HW address bits', 'Primary cache', 'Secondary cache', 'Other cache',
       'Memory size', 'Memory details', 'Other HW details', 'OS name',
       'OS available', 'Filesystem', 'JVM name', 'JVM version',
       'JVM available', 'JVM Vendor', 'URL Suffix'],
      dtype='object')

In [6]:
jvm2008_columns = [
    'Suite', 'HW vendor', 'HW model', 'Result',
    'Test date:', 'HW available', 'CPU name', 'CPU vendor',
    'CPU frequency', '# of logical cpus', '# of chips', '# of cores',
    'Cores per chip', 'Threads per core',
    'Primary cache', 'Secondary cache',
    'Memory size', 'Memory details',
    'OS name', 'Filesystem', 'JVM name', 'JVM version',
    'URL Suffix'
]
jvm2008 = jvm2008[jvm2008_columns]

In [30]:
def get_memory_number(memory_info):
    items = re.findall(r'\d+[ ]*x|\d+[ ]*\*', memory_info)
    nums = [int(re.sub('[ ]*x|[ ]*\*', '', item)) for item in items]
    memory_num = sum(nums)
    if memory_num == 0:
        memory_num = 1
    return memory_num
jvm2008['Memory'].apply(lambda x: get_memory_number(x))

0     32
1     16
2     12
3     32
4     32
5     32
6      2
7      2
8      2
9     12
10    16
11    16
Name: Memory, dtype: int64

In [25]:
jvm2008['Memory']

0      32 x 8 GB 1Rx4 PC3L-12800R-11
1        16 x 16 GB 2Rx4 PC4-2133P-R
2              12*4GB DDR3-1333 DIMM
3            32x 8 GB DDR3-1066 DIMM
4            32x 8 GB DDR3-1066 DIMM
5            32x 8 GB DDR3-1066 DIMM
6               2 x 2GB DDR3 1067MHz
7               2 x 2GB DDR3 1067MHz
8               2 x 2GB DDR3 1067MHz
9             12x 4GB DDR3-1066 DIMM
10    16 x 4GB 2Rx4 PC2-5300F FBDIMM
11    16 x 4GB 2Rx4 PC2-5300F FBDIMM
Name: Memory, dtype: object

In [7]:
jvm2008_rename_dict = {
    'HW model': 'System Name',
    'Test date:': 'Test Date',
    'HW available': 'HW Avail',
    'CPU name': 'CPU Name',
    'CPU vendor': 'CPU Vendor',
    'CPU frequency': 'CPU MHz',
    'Cores oer chip': 'Cores Per Chips',
    'Threads per core': 'Threads Per Core',
    'Primary cache': 'L1 Cache',
    'Secondary cache': 'L2 Cache',
    'Memory size': 'Memory Size',
    'Memory details': 'Memory',
    'OS name': 'OS',
    'Filesystem': 'File System',
    'JVM name': 'JVM Name',
    'JVM version': 'JVM Version',
}

In [9]:
jvm2008.rename(columns=jvm2008_rename_dict, inplace=True)

In [10]:
jvm2008['Cores Per Chip'] = jvm2008['# of cores'] // jvm2008['# of chips']

In [18]:
jvm2008[['CPU Name', 'CPU MHz']].values

array([['Intel Xeon E7-4830 v2(Intel Turbo Boost Technology up to 2.70 GHz)',
        '2200'],
       ['Intel Xeon E5-2660 v3(Intel Turbo Boost Technology up to 3.30 GHz)',
        '2600'],
       ['Intel Xeon E5645 (Intel Turbo Boost Technology up to 2.67GHz)',
        '2.4 GHz'],
       ['SPARC T4', '2848'],
       ['SPARC T4', '2848'],
       ['SPARC T3', '1.65 GHz'],
       ['Intel Core 2 Duo CPU E8335', '2930'],
       ['Intel Core 2 Duo CPU E8335', '2930'],
       ['Intel Core 2 Duo CPU E8335', '2930'],
       ['Intel Xeon X5570 (Intel Turbo Boost Technology up to 3.33GHz)',
        '2.93 GHz'],
       ['Intel Xeon X7460 Quad Core (1066 MHz system bus)', '2667'],
       ['Intel Xeon X7350 Quad Core (1066 MHz system bus)', '2933']],
      dtype=object)

In [21]:
def format_cpu_name(info):
    return re.sub('[(].*?[)]', '', info)
jvm2008['CPU Name'].apply(lambda x: format_cpu_name(x))

0           Intel Xeon E7-4830 v2
1           Intel Xeon E5-2660 v3
2               Intel Xeon E5645 
3                        SPARC T4
4                        SPARC T4
5                        SPARC T3
6      Intel Core 2 Duo CPU E8335
7      Intel Core 2 Duo CPU E8335
8      Intel Core 2 Duo CPU E8335
9               Intel Xeon X5570 
10    Intel Xeon X7460 Quad Core 
11    Intel Xeon X7350 Quad Core 
Name: CPU Name, dtype: object

In [20]:
def _func(info):
    if 'GHz' in info:
        info = re.sub('[ ]*GHz', '', info)
        val = float(info)
    else:
        val = float(info) / 1000
    val = round(val, 2)
    return val
jvm2008['CPU MHz'].apply(lambda x: _func(x))

0     2.20
1     2.60
2     2.40
3     2.85
4     2.85
5     1.65
6     2.93
7     2.93
8     2.93
9     2.93
10    2.67
11    2.93
Name: CPU MHz, dtype: float64

In [22]:
jvm2008['Memory Size']

0     256 GB
1     256 GB
2       48GB
3     262144
4     262144
5     256 GB
6     4096MB
7     4096MB
8     4096MB
9       48GB
10     65536
11     65536
Name: Memory Size, dtype: object

In [24]:
def get_memory_amount(info):
    items = re.findall('\d+\.*\d*[ ]?GB', info)
    value = 0
    if len(items) == 0:
        items = re.findall('\d+\.*\d*[ ]?MB', info)
        if len(items):
            info = re.sub('[ ]*MB', '', info)
        value = float(info) / 1024
    else:
        info = re.sub('[ ]*GB', '', info)
        value = float(info)
    value = round(value, 2)
    return value
jvm2008['Memory Size'].apply(lambda x: get_memory_amount(x))

0     256.0
1     256.0
2      48.0
3     256.0
4     256.0
5     256.0
6       4.0
7       4.0
8       4.0
9      48.0
10     64.0
11     64.0
Name: Memory Size, dtype: float64

In [13]:
jvm2008['Cores Per Chip'] * jvm2008['Threads Per Core'] * jvm2008['# of chips'] == jvm2008['# of logical cpus']

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
dtype: bool

In [14]:
jvm2008[['Cores Per Chip', 'Threads Per Core', '# of chips', '# of logical cpus']]

Unnamed: 0,Cores Per Chip,Threads Per Core,# of chips,# of logical cpus
0,10,1,4,40
1,10,2,2,40
2,6,2,2,"24 (2 chips, 6 cores/chip, 2 threads/core)"
3,8,8,2,128
4,8,8,2,128
5,16,8,2,256
6,2,1,1,2
7,2,1,1,2
8,2,1,1,2
9,4,2,2,"16 (2 chips, 4 cores/chip, 2 threads/core)"


In [34]:
jvm2008.isnull().any(axis=1)

0     False
1      True
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
dtype: bool

In [35]:
jvm2008.loc[1]

Suite                                                      SPECjvm2008
HW vendor                                                        Sugon
System Name                      Sugon I620-G20(Intel Xeon E5-2660 v3)
Result                                                          853.15
Test Date                                 Thu Dec 25 19:18:18 CST 2014
HW Avail                                                       2014.09
CPU Name             Intel Xeon E5-2660 v3(Intel Turbo Boost Techno...
CPU Vendor                                                       Intel
CPU MHz                                                           2600
# of logical cpus                                                   40
# of chips                                                           2
# of cores                                                          20
Cores per chip                                                      10
Threads Per Core                                                     2
L1 Cac

In [None]:
jvm2008['Cores per chip']

In [11]:
jvm2008[~(jvm2008['# of cores'] == jvm2008['Cores per chip'] * jvm2008['# of chips'])][['# of cores', 'Cores per chip', '# of chips']]

Unnamed: 0,# of cores,Cores per chip,# of chips
0,40,1,4


In [None]:
jvm2008['# of logical cpus'] == jvm2008['Threads per core'] * jvm2008['# of cores']

In [None]:
jvm2008_rename_dict = {
    'HW model': 'System Name',
    'Result': 'Benchmark',
    'Test date:': 'Test Date',
    'HW available': 'HW Avail',
    'CPU name': 'CPU Name',
    'CPU vendor': 'CPU Vendor',
    'CPU frequency': 'CPU MHz',
    
}