In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import csv
import os
import random

## Generate and Clean Up Datasets

In [None]:
def split_dataset(dataset, train_fraction=0.7, seed=None):
    """Split a CSV file into a training CSV and a testing CSV.

    Every row is assigned independently: it goes to the training set when a
    uniform random draw from [0, 1) is ``<= train_fraction`` and to the
    testing set otherwise, so the resulting sizes only approximate the
    requested ratio.

    Parameters
    ----------
    dataset : str
        Path of the CSV file to read. The two outputs are written to
        ``f"{dataset}_train"`` and ``f"{dataset}_test"``.
    train_fraction : float, default 0.7
        Probability that a row lands in the training set.
    seed : int or None, default None
        Seed of a dedicated random generator, making the split reproducible.
        With ``None`` the draws come from NumPy's global random state.
    """

    df = pd.read_csv(dataset)

    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(seed).random(len(df))
    msk = draws <= train_fraction

    df[msk].to_csv(f"{dataset}_train", index=False)
    df[~msk].to_csv(f"{dataset}_test", index=False)

# Split the "spec2006" CSV into "spec2006_train" and "spec2006_test".
# NOTE(review): the path has neither a ".csv" extension nor a "datasets/"
# prefix — confirm a file with exactly this name exists in the working directory.
split_dataset("spec2006")

In [None]:
def combine_microbenchmarks(path, name, phase, trim=100):
    """Combine the per-PMC text files of a microbenchmark into CSV files.

    Every sub-folder of ``path`` is treated as a configuration folder. Each
    ``*.txt`` file inside it (except the discarded bookkeeping files) holds
    one value per line and becomes a column named after the text before the
    first ``.`` of the file name. Two label columns, ``cluster`` and
    ``phase``, are appended. Values are kept as the strings read from the
    files.

    Side effects: writes ``{path}/{config}/{name}_{config}.csv`` for each
    processed configuration folder and ``{path}/{name}.csv`` with all of them
    concatenated.

    Parameters
    ----------
    path : str
        Folder containing the configuration sub-folders.
    name : str
        Base name of the CSV files to write.
    phase : str
        Label stored in the ``phase`` column of every row.
    trim : int, default 100
        Number of rows dropped from both the start and the end of each
        configuration, since the first and last samples may not be
        representative of the phase.

    Returns
    -------
    pandas.DataFrame
        The concatenated data. An empty frame is returned (and no master CSV
        is written) when no configuration folder was processed.

    Raises
    ------
    KeyError
        If a processed folder containing data files is not a key of the
        ``configs`` mapping.
    ValueError
        If the data files of one folder have different numbers of lines.
    """

    discard_files = ["clock.txt", "time.txt", ".DS_Store"]
    configs = {'LITTLE': "0", 'big': "1"}
    dataframe_collections = []

    # Sorted so that row and column order do not depend on the file system.
    for config in sorted(os.listdir(path)):
        if config in discard_files:
            continue

        # Ensure path is a folder
        if not os.path.isdir(f"{path}/{config}/"):
            continue

        # Skip big config (for testing purposes)
        if config == "big":
            continue

        # List the files, each containing data points for a PMC
        data_files = sorted(
            file for file in os.listdir(f"{path}/{config}/")
            if file not in discard_files and file.endswith(".txt")
        )
        column_headers = [file.split('.')[0] for file in data_files]

        # Insert the data of each individual file as a column
        df = pd.DataFrame(columns=column_headers + ["cluster", "phase"])
        for file, header in zip(data_files, column_headers):
            with open(f"{path}/{config}/{file}", "r") as text_file:
                df[header] = text_file.read().splitlines()
        if data_files:
            df['cluster'] = configs[config]
            df['phase'] = phase

        # Drop `trim` rows at both ends; the stop is clamped so that short
        # inputs (or trim=0) never turn into an unintended negative index.
        df = df.iloc[trim:max(len(df) - trim, 0)]
        dataframe_collections.append(df)
        df.to_csv(f"{path}/{config}/{name}_{config}.csv", index=False)

    # Nothing to combine: return an empty frame instead of failing on an
    # unbound variable.
    if not dataframe_collections:
        return pd.DataFrame()

    # Build and write the master CSV once, after all folders were read.
    master_dataframes = pd.concat(dataframe_collections)
    master_dataframes.to_csv(f"{path}/{name}.csv", index=False)
    return master_dataframes

In [None]:
def combine_spec2006(path, name, phase, trim=200):
    """Combine the per-PMC text files of a SPEC2006 benchmark into CSV files.

    Every sub-folder of ``path`` whose name does not contain ``"big"`` is
    processed. Each ``*.txt`` file inside it (except the discarded files)
    holds one value per line and becomes a column named after the text before
    the first ``.`` of the file name. A ``cluster`` column (``"0"`` when the
    folder name contains ``"LITTLE"``, ``"1"`` otherwise) and a ``phase``
    column are appended. Values are kept as the strings read from the files.

    Side effects: writes ``{path}/{config}/{name}_{config}.csv`` for each
    processed folder and ``{path}/{name}.csv`` with all of them concatenated.

    Parameters
    ----------
    path : str
        Folder containing the configuration sub-folders.
    name : str
        Base name of the CSV files to write.
    phase : str
        Label stored in the ``phase`` column of every row.
    trim : int, default 200
        Number of rows dropped from both the start and the end of each
        configuration, since the first and last samples may not be
        representative of the phase.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``path`` is not a directory. Otherwise the concatenated
        data; an empty frame is returned (and no master CSV is written) when
        no configuration folder was processed.

    Raises
    ------
    ValueError
        If the data files of one folder have different numbers of lines.
    """

    # Ensure path is valid
    if not os.path.isdir(path):
        return

    discard_files = ["dtlb_misses.txt", "l1_data_accesses.txt", "clock.txt", "time.txt", ".DS_Store"]
    dataframe_collections = []

    # Sorted so that row and column order do not depend on the file system.
    for config in sorted(os.listdir(path)):
        if config in discard_files:
            continue

        # Ensure path is a folder
        if not os.path.isdir(f"{path}/{config}/"):
            continue

        # Skip big config (for testing purposes)
        if "big" in config:
            continue

        # List the files, each containing data points for a PMC
        data_files = sorted(
            file for file in os.listdir(f"{path}/{config}/")
            if file not in discard_files and file.endswith(".txt")
        )
        column_headers = [file.split('.')[0] for file in data_files]

        # Insert the data of each individual file as a column
        df = pd.DataFrame(columns=column_headers + ["cluster", "phase"])
        for file, header in zip(data_files, column_headers):
            with open(f"{path}/{config}/{file}", "r") as text_file:
                df[header] = text_file.read().splitlines()
        if data_files:
            df['cluster'] = "0" if "LITTLE" in config else "1"
            df['phase'] = phase

        # Drop `trim` rows at both ends; the stop is clamped so that short
        # inputs (or trim=0) never turn into an unintended negative index.
        df = df.iloc[trim:max(len(df) - trim, 0)]
        dataframe_collections.append(df)
        df.to_csv(f"{path}/{config}/{name}_{config}.csv", index=False)

    # Nothing to combine: return an empty frame instead of failing on an
    # unbound variable.
    if not dataframe_collections:
        return pd.DataFrame()

    # Build and write the master CSV once, after all folders were read.
    master_dataframes = pd.concat(dataframe_collections)
    master_dataframes.to_csv(f"{path}/{name}.csv", index=False)
    return master_dataframes

In [None]:
def generate_datasets_microbenchmarks(path="raw-data/phases", output='datasets/micro_dataset.csv'):
    """Generate the microbenchmark dataset CSV from the raw PMC files.

    ``path`` is scanned for benchmark folders. Inside each benchmark folder,
    the sub-folders named ``cpu`` and ``mem`` are combined with
    ``combine_microbenchmarks`` and labelled with phase ``"0"`` and ``"1"``
    respectively; other entries are ignored. All combined frames are
    concatenated and written to ``output``.

    Parameters
    ----------
    path : str, default "raw-data/phases"
        Folder containing one sub-folder per benchmark.
    output : str, default "datasets/micro_dataset.csv"
        CSV file to write. Its parent folder is created when missing.

    Returns
    -------
    None
        Nothing is written when no ``cpu``/``mem`` folder is found.
    """

    phase = {'cpu': "0", 'mem': "1"}
    dataframe_collections = []

    # Sorted so that the row order does not depend on the file system.
    for benchmark in sorted(os.listdir(path)):
        benchmark_path = f"{path}/{benchmark}"
        if not os.path.isdir(benchmark_path):
            continue

        for folder in sorted(os.listdir(benchmark_path)):
            if folder in phase:
                df = combine_microbenchmarks(f"{benchmark_path}/{folder}", folder, phase[folder])
                dataframe_collections.append(df)

    # pd.concat raises on an empty list, so bail out when nothing was found.
    if not dataframe_collections:
        return

    output_folder = os.path.dirname(output)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Concatenate and write once, after every benchmark has been read.
    master_dataframes = pd.concat(dataframe_collections)
    master_dataframes.to_csv(output, index=False)

# Build datasets/micro_dataset.csv from the raw files under raw-data/phases.
generate_datasets_microbenchmarks()

In [None]:
def generate_datasets_spec2006(path="raw-data/mosse-spec2006", output='datasets/spec2006_dataset.csv'):
    """Generate the SPEC2006 dataset CSV from the raw PMC files.

    The CPU-intensive and memory-intensive benchmarks are processed
    alternately (povray, GemsFDTD, gromacs, mcf, calculix, milc). Each one is
    combined with ``combine_spec2006`` and labelled with its phase, and all
    resulting frames are concatenated and written to ``output``.

    Parameters
    ----------
    path : str, default "raw-data/mosse-spec2006"
        Folder containing one sub-folder per benchmark.
    output : str, default "datasets/spec2006_dataset.csv"
        CSV file to write. Its parent folder is created when missing.

    Raises
    ------
    FileNotFoundError
        If the folder of one of the benchmarks does not exist under ``path``.
    """

    # '2': phased, '1': memory-intensive, '0': cpu-intensive
    phases = {
        'astar': "2",
        'sjeng': "2",
        'leslie3d': "2",
        'GemsFDTD': "1",
        'mcf':  "1",
        'milc':  "1",
        'calculix':  "0",
        'gromacs': "0",
        'povray': "0",
    }
    cpu_benchmarks = ["povray", "gromacs", "calculix"]
    mem_benchmarks = ["GemsFDTD", "mcf", "milc"]

    # Interleave CPU and memory benchmarks. The phased benchmarks (astar,
    # sjeng, leslie3d) have labels above but are currently not included.
    benchmarks = [val for pair in zip(cpu_benchmarks, mem_benchmarks) for val in pair]

    dataframe_collections = []
    for benchmark in benchmarks:
        folder_path = f"{path}/{benchmark}"

        # Fail early and explicitly when the raw data of a benchmark is missing.
        if not os.path.isdir(folder_path):
            raise FileNotFoundError(f"Missing benchmark folder: {folder_path}")

        print(benchmark)  # progress output
        df = combine_spec2006(folder_path, benchmark, phases[benchmark])
        if df is not None:
            dataframe_collections.append(df)

    # pd.concat raises on an empty list, so bail out when nothing was combined.
    if not dataframe_collections:
        return

    output_folder = os.path.dirname(output)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Concatenate and write once, after every benchmark has been read.
    master_dataframes = pd.concat(dataframe_collections)
    master_dataframes.to_csv(output, index=False)

# Build datasets/spec2006_dataset.csv from the raw files under raw-data/mosse-spec2006.
generate_datasets_spec2006()

In [None]:
def combine_datasets(datasets):
    """Concatenate several CSV datasets into ``training_master_dataset.csv``.

    ``datasets`` is an iterable of CSV paths. The list is printed, every file
    is read, and the rows are stacked in the given order and written (without
    the index) to ``training_master_dataset.csv`` in the working directory.
    """

    print(datasets)

    frames = []
    for csv_path in datasets:
        frames.append(pd.read_csv(csv_path))

    combined = pd.concat(frames)
    combined.to_csv("training_master_dataset.csv", index=False)
    
# Merge the smoothed SPEC2006 and microbenchmark datasets into training_master_dataset.csv.
# NOTE(review): these paths have no "datasets/" prefix, whereas filter_benchmarks
# writes to "datasets/smoothed_<name>.csv", and this cell comes before the
# filtering cell — confirm both files exist when running top to bottom.
datasets = ["smoothed_spec2006.csv", "smoothed_micro.csv"]
combine_datasets(datasets)

## Filter and Plot Datasets

In [None]:
from scipy.signal import savgol_filter

def filter_data(col):
    """Smooth ``col`` with a Savitzky–Golay filter and cast the result to int.

    The filter uses a window of 53 samples and a polynomial of order 1.
    """
    window_length, poly_order = 53, 1
    smoothed = savgol_filter(col, window_length, poly_order)
    return smoothed.astype(int)

def filter_benchmarks(dataset_name):
    """Smooth a generated dataset with the Savitzky–Golay filter.

    Reads ``datasets/{dataset_name}_dataset.csv``, applies ``filter_data`` to
    every column except the last two, re-attaches the last two columns
    unchanged and writes ``datasets/smoothed_{dataset_name}.csv``.
    """

    raw = pd.read_csv(f"datasets/{dataset_name}_dataset.csv")

    # The last two columns are passed through without smoothing.
    passthrough_cols = raw.iloc[:, -2:]
    pmc_cols = raw.iloc[:, :-2]

    smoothed_pmcs = pmc_cols.apply(filter_data)
    result = pd.concat([smoothed_pmcs, passthrough_cols], axis=1)
    result.to_csv(f"datasets/smoothed_{dataset_name}.csv", index=False)
    
# Smooth datasets/spec2006_dataset.csv into datasets/smoothed_spec2006.csv.
filter_benchmarks("spec2006")

In [None]:
def create_datasets(dataset):
    """Generate a dataset from the raw files and then smooth it.

    ``"spec"`` selects the SPEC2006 dataset; any other value selects the
    microbenchmark dataset.
    """

    use_spec = dataset == "spec"

    generate = generate_datasets_spec2006 if use_spec else generate_datasets_microbenchmarks
    generate()
    filter_benchmarks("spec2006" if use_spec else "micro")

# Regenerate and smooth the SPEC2006 dataset in one step.
# NOTE(review): this repeats the generate_datasets_spec2006() and
# filter_benchmarks("spec2006") calls made in earlier cells.
create_datasets("spec")

## Plot Datasets

In [None]:
def plot_datasets(dataset, column='instr'):
    """Plot one column of a dataset stored in the ``datasets`` folder.

    Parameters
    ----------
    dataset : str
        File name of the CSV inside ``datasets/``.
    column : str, default "instr"
        Name of the column to plot against the row index.
    """

    df = pd.read_csv(f"datasets/{dataset}")

    # Draw on the current axes, as plt.plot does, and label the figure so it
    # can be read on its own.
    ax = plt.gca()
    ax.plot(df[column])
    ax.set_xlabel("sample")
    ax.set_ylabel(column)
    ax.set_title(f"{dataset}: {column}")
    
# Plot the 'instr' column of datasets/smoothed_spec2006.csv.
plot_datasets("smoothed_spec2006.csv")

In [None]:
def calculate_groundtruth(dataset):
    """Plot the ratio of bus accesses to instructions for a dataset.

    ``dataset`` is the path of a CSV file with ``bus_accesses`` and ``instr``
    columns; the per-row ratio is drawn with matplotlib.
    """

    frame = pd.read_csv(dataset)

    bus_per_instr = (frame['bus_accesses'] / frame['instr']).rename('bus/instr')
    plt.plot(bus_per_instr)
    
# Plot bus accesses per instruction for the smoothed SPEC2006 dataset.
# NOTE(review): unlike the other cells, this path has no "datasets/" prefix —
# confirm the file exists in the working directory.
calculate_groundtruth("smoothed_spec2006.csv")
    

In [None]:
def calculate_difference(dataset, output='datasets/spec2006_pctchange.csv',
                         alpha=0.9, window_size=100):
    """Compute windowed relative changes of every PMC column and write a CSV.

    For each row ``i >= window_size`` and each column other than ``phase``
    and ``cluster``::

        prev   = mean(col[i - window_size:i]) * (1 - alpha)
        curr   = col[i] * alpha
        change = (curr - prev) / prev

    The output has one row per ``i`` with the change of each PMC column, the
    ``phase`` and ``cluster`` labels of row ``i``, and a ``change`` column
    that is ``"1"`` when the ``window_size`` rows before ``i`` contain more
    than one distinct phase and ``"0"`` otherwise.

    NOTE(review): weighting ``curr`` and ``prev`` by ``alpha`` and
    ``1 - alpha`` makes this differ from a plain percentage change — confirm
    that this is the intended formula.

    Parameters
    ----------
    dataset : str
        Path of the input CSV; it must contain ``phase`` and ``cluster``
        columns.
    output : str, default "datasets/spec2006_pctchange.csv"
        Path of the CSV file to write.
    alpha : float, default 0.9
        Weight of the current sample; the window average gets ``1 - alpha``.
    window_size : int, default 100
        Number of preceding rows used for the average and for the phase
        change detection. Must be at least 1.
    """

    # Read dataset
    df = pd.read_csv(dataset)
    discard = ["phase", "cluster"]
    pmcs = [pmc for pmc in df.columns if pmc not in discard]

    # rolling(...).mean() at row j averages rows j-window_size+1..j; shifting
    # by one makes row i hold the average of the window_size rows before it.
    prev_avg = df[pmcs].rolling(window_size).mean().shift(1) * (1 - alpha)
    curr = df[pmcs] * alpha
    master = ((curr - prev_avg) / prev_avg).iloc[window_size:].reset_index(drop=True)

    # Labels are taken positionally from row window_size onwards so that they
    # belong to the same input rows as the computed changes.
    for d in discard:
        master[d] = df[d].iloc[window_size:].to_numpy()

    # Determine phase changes
    phase = df['phase'].to_numpy()
    master['change'] = [
        "1" if len(set(phase[i - window_size:i])) > 1 else "0"
        for i in range(window_size, len(phase))
    ]

    # Write dataframe to CSV
    master.to_csv(output, index=False)
    
# Write datasets/spec2006_pctchange.csv from the smoothed SPEC2006 dataset.
calculate_difference("datasets/smoothed_spec2006.csv")

## Model Training

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import pickle

def train_model():
    """Train a decision-tree classifier and persist it together with its scaler.

    Reads ``test2.csv``, uses the last column as label and five PMC columns
    as features, holds out 15% of the rows for testing, standardises the
    features, fits a ``DecisionTreeClassifier``, prints its accuracy on the
    held-out rows and writes ``phases-model.pkl`` and ``phases-scaler.pkl``.
    """
    # Read dataset
    dataset = pd.read_csv("test2.csv")
    labels = dataset.iloc[:,-1:]
    # NOTE(review): any code that evaluates the saved model must select
    # exactly these feature columns, in this order.
    features = dataset[['cycles', 'instr', 'llc_accesses', 'llc_misses', 'dtlb_misses']]

    # Split dataset into training and testing
    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.15, shuffle=True)

    # Scale data; the scaler is fitted on the training rows only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the decision tree; labels are flattened to 1-D for both fit and score
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train.values.ravel())

    # Evaluate the model on the held-out rows
    test_score = model.score(X_test, y_test.values.ravel())
    print(test_score)

    # Output model to pickle file (context manager closes the file handle)
    with open("phases-model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Output scaler to pickle file
    with open("phases-scaler.pkl", "wb") as scaler_file:
        pickle.dump(scaler, scaler_file)
    
# Train the classifier and write phases-model.pkl and phases-scaler.pkl.
train_model()

## Machine Learning Pipeline

## Model Testing

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pickle import load

# Read dataset; the last column holds the labels
dataset = pd.read_csv("test1.csv")
labels = dataset.iloc[:,-1:]

# Load the fitted model and scaler.
# NOTE: unpickling can execute arbitrary code — only load files that were
# produced by a trusted run of the training cell.
with open("phases-model.pkl", "rb") as model_file:
    model = load(model_file)
with open("phases-scaler.pkl", "rb") as scaler_file:
    scaler = load(scaler_file)

# Select the columns the scaler was fitted on, so that evaluation uses the
# same features as training. Scalers fitted without column names fall back
# to the hard-coded list.
default_features = ['cycles', 'instr', 'llc_accesses', 'llc_misses', 'bus_accesses']
feature_names = list(getattr(scaler, "feature_names_in_", default_features))
features = dataset[feature_names]

# Scale data
features = scaler.transform(features)

# Evaluate the loaded model
test_score = model.score(features, labels.values.ravel())
print(test_score)