In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import json
import matplotlib.pyplot as plt
import random
from sklearn.metrics import accuracy_score
from devtools import pprint 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from helper import one_hot_encoding
from tqdm import tqdm
random_state = 42
random.seed(random_state)

# Useful functions 
A couple of function used to: 
- `train_validation_test_split`: split the dataset into train,validation and test
- `get_time_matrix`: get the time values of each combination in a matrix

In [3]:
def train_validation_test_split(x, y, weights = [], test_buckets = []):
    BUCKETS = 10

    N_ELEMENTS = len(x)

    BUCKET_SIZE = N_ELEMENTS // BUCKETS

    TEST_BUCKETS = 1


    x_local = x.copy()
    y_local = y.copy()
    weights = [w for w in weights] if len(weights) == len(y) else [1 for _ in range(len(y))]
    x_test, y_test = [], []
    #If the test bucket(s) has not been provided, generate it randomly
    if len(test_buckets) == 0: 
        for _ in range(TEST_BUCKETS):
            idx = random.randint(0, BUCKETS - 1)
            while idx in test_buckets:
                idx = random.randint(0, BUCKETS - 1)
            test_buckets.append(idx)

    #Generate the test set
    for bucket in test_buckets:
        idx = bucket * BUCKET_SIZE
        for _ in range(BUCKET_SIZE):
            weights.pop(idx)
            x_test.append(x_local.pop(idx))
            y_test.append(y_local.pop(idx))

    #generate the train set as 90% of the remaining dataset
    train_elements = (len(y_local) // 10) * 9
    x_train = x_local[:train_elements]
    y_train = y_local[:train_elements]
    train_weights = weights[:train_elements]

    #generate the valdation set as 10% of the validation data
    x_validation = x_local[train_elements:]
    y_validation = y_local[train_elements:]
    
    return (x_train, y_train, train_weights), (x_validation, y_validation), (x_test, y_test)

def get_time_matrix(shape:tuple, times:list[list[dict]]):
    time_matrix = np.zeros(shape)
    for i in range(len(times)):
        times_i = times[i]
        for j in range(len(times_i)):
            time_matrix[i,j] = times_i[j]["time"]
    return time_matrix

# Data loading and preparation

In [4]:
features_dir = "../features"
files = [f for f in os.listdir(features_dir) if os.path.isfile(os.path.join(features_dir, f))]
data = {}
#load all the features in memory
for file in files:
    with open(features_dir + "/" + file) as f:
        data[file] = json.load(f)
splitted = [file.replace(".json", "").replace(".param", "").split("_") for file in files]
models = np.unique(["_".join(split[:-1]) for split in splitted]).tolist()
instances = np.unique([split[-1] for split in splitted]).tolist()
models = ["01_compact.eprime"]
instances_data = {}
#save the features as model that generates them - instance
for instance in instances:
    for model in models:
        instances_data[instance] = data[f"{model}_{instance}.json"]

In [5]:
#open the dataset
f = open("../data/datasets/dataset_CarSequencing-2024-03-19.json")
dataset = json.load(f)
f.close()

In [6]:
for datapoint in dataset:
    instance = datapoint["instance_name"].split("/")[-1].replace(".param", ".fnz2feat")
    datapoint["features"] = instances_data[instance]
    datapoint["all_times"] = sorted(datapoint["all_times"], key= lambda x: x["combination"])

In [7]:
data = pd.DataFrame(dataset)
data.head()

Unnamed: 0,combination,time,instance_value,instance_name,all_times,features
0,cplex_06_chPrunedLevels.eprime,3.18,language Essence 1.3\n\nletting blksize_delta ...,params/generated/c28b092a8f728601da45adef533a2...,"[{'time': 41.64, 'combination': 'chuffed_01_co...","{'c_avg_deg_cons': 8.55327, 'c_avg_dom_cons': ..."
1,or-tools-1_06_chPrunedLevels.eprime,1.85,language Essence 1.3\n\nletting blksize_delta ...,params/generated/bda2c9118004a4f49b43c1d56d23b...,"[{'time': 64.38, 'combination': 'chuffed_01_co...","{'c_avg_deg_cons': 5.84957, 'c_avg_dom_cons': ..."
2,chuffed_01_compact.eprime,12.48,language Essence 1.3\n\nletting blksize_delta ...,params/generated/33ee22ee78354fcd6547ae283d854...,"[{'time': 12.48, 'combination': 'chuffed_01_co...","{'c_avg_deg_cons': 4.71617, 'c_avg_dom_cons': ..."
3,or-tools-1_01_compact.eprime,1.57,language Essence 1.3\n\nletting blksize_delta ...,params/generated/6549d86c3e82a1cc9ad6d047b8b70...,"[{'time': 64.16, 'combination': 'chuffed_01_co...","{'c_avg_deg_cons': 5.35605, 'c_avg_dom_cons': ..."
4,chuffed_01_compact.eprime,12.94,language Essence 1.3\n\nletting blksize_delta ...,params/generated/7b01e73519f42cb521ac7f638a98a...,"[{'time': 12.94, 'combination': 'chuffed_01_co...","{'c_avg_deg_cons': 3.91856, 'c_avg_dom_cons': ..."


In [17]:
feat = []
times = []
for i in range(len(data)):
    row = data.iloc[i,:]
    current_feat = {"inst": row["instance_name"]}
    current_feat.update(row["features"])
    feat.append(current_feat)
    current_times = {"inst": row["instance_name"]}
    for time in row["all_times"]:
        current_times[time["combination"]] = time["time"]
    times.append(current_times)

features = pd.DataFrame(feat)
features.set_index('inst', inplace=True)
times = pd.DataFrame(times)
times.set_index('inst', inplace=True)
features.to_csv("../data/datasets/dataset_CarSequencing_features-2024-03-19.csv")
times.to_csv("../data/datasets/dataset_CarSequencing_times-2024-03-19.csv")

In [21]:
X = data["features"]
X = [x for x in X.values]
weights = [sum([1 if t["time"] > 3600 else 0 for t in times]) for times in data["all_times"].to_list()]
y = [datapoint for datapoint in data["combination"].to_list()]
combinations = [d["combination"] for d in data["all_times"].tolist()[0]]
y = one_hot_encoding(y, combinations)
y  = [y_t.numpy() for y_t in y]

all_times = [datapoint["all_times"] for datapoint in dataset]
times_matrix = get_time_matrix((len(all_times), len(combinations)), all_times)
y_np = np.array(y)
majority_index = np.argmax([np.sum(y_np[:, i]) for i in range(len(combinations))])

bucket_size = len(X) // 10

# Training a randomForest classifier

In [22]:
stats = []

#repeat the experiment for different train/validation/test splits
for test_bucket in tqdm(range(10), "training on different dataset splits"):
    (X_train, y_train, _), (X_validation, y_validation), (X_test, y_test) = train_validation_test_split(X,y, weights=weights, test_buckets=[test_bucket])
    X_train, X_validation, X_test = pd.DataFrame(X_train), pd.DataFrame(X_validation), pd.DataFrame(X_test)

    test_start = test_bucket * bucket_size
    test_idxs = [test_start + k for k in range(len(y_test))]
    train_val_idxs = [k for k in range(len(y)) if not k in test_idxs]
    train_elements = (len(train_val_idxs) // 10) * 9
    train_idxs = train_val_idxs[:train_elements]
    validation_idxs = train_val_idxs[train_elements:]

    assert len(test_idxs) == len(y_test)
    assert len(validation_idxs) == len(y_validation)
    assert len(train_idxs) == len(y_train)

    #Virtual best
    min_train = sum([min(times_matrix[i, :]) for i in train_idxs])
    min_val = sum([min(times_matrix[i, :]) for i in validation_idxs])
    min_test = sum([min(times_matrix[i, :]) for i in test_idxs])

    #majority classifier: AKA single best with highest wins
    majority_train = sum([times_matrix[i, majority_index] for i in train_idxs])
    majority_val = sum([times_matrix[i, majority_index] for i in validation_idxs])
    majority_test = sum([times_matrix[i, majority_index] for i in test_idxs])

    #single best with lowest time
    sb_t = [sum([times_matrix[i, j] for i in train_idxs]) for j in range(len(combinations))]
    sb_v = [sum([times_matrix[i, j] for i in validation_idxs]) for j in range(len(combinations))]
    sb_te = [sum([times_matrix[i,j] for i in test_idxs]) for j in range(len(combinations))]

    sb_train = min(sb_t)
    sb_validation = min(sb_v)
    sb_test = min(sb_te)


    #generating a list of hyperprameters to find the best randomForest 
    paramters_range = [
        {"criterion":["gini", "entropy", "log_loss"], 
         "max_depth":[10,20,30, 40], 
         "min_samples_split": [2, 4, 8, 16], 
         "max_features": ["sqrt", "log2", None],
         "bootstrap": [True, False]}]
    parameters = list(ParameterGrid(paramters_range))
    val_scores = []
    train_scores = []
    e = lambda x: np.argmax(x)
    #training a randomForest for each hyperparameter set
    for i in range(len(parameters)):
        param = parameters[i]
        forest = RandomForestClassifier(**param,random_state=random_state)
        forest.fit(X_train, y_train)
        y_pred_val = forest.predict(X_validation)
        y_pred_train = forest.predict(X_train)
        val_score = sum([times_matrix[idx, e(y_pred_val[i])] for idx in validation_idxs])
        train_score = sum([times_matrix[idx, e(y_pred_train[i])] for idx in train_idxs])
        val_scores.append(np.mean(val_score))
        train_scores.append(np.mean(train_score))

    #taking the model with lowest validation time
    best_tree_val = RandomForestClassifier(**parameters[np.argmin(val_scores)], random_state=random_state)
    best_tree_val.fit(X_train, y_train)
    y_test_predicted = best_tree_val.predict(X_test)
    y_train_predicted = best_tree_val.predict(X_train)
    y_val_predicted = best_tree_val.predict(X_validation)

    #ŧesting on the different datasets
    times_train = [times_matrix[train_idxs[i], e(y_train_predicted[i])] for i in range(len(y_train))]
    times_validation = [times_matrix[train_idxs[i], e(y_val_predicted[i])] for i in range(len(y_validation))]
    times_test = [times_matrix[train_idxs[i], e(y_test_predicted[i])] for i in range(len(y_test))]
    
    #saving the outcome
    stats.append({
        "train_time": round(sum(times_train), 2),
        "vb_train": round(min_train, 2),
        "sb_min_time_train": round(sb_train, 2),
        "sb_max_wins_train":round(majority_train, 2),
        "validation_time": round(sum(times_validation), 2),
        "vb_val": round(min_val, 2),
        "sb_min_time_val": round(sb_validation, 2),
        "sb_max_wins_val":round(majority_val, 2),
        "test_time": round(sum(times_test), 2),
        "vb_test": round(min_test, 2),
        "sb_min_time_test": round(sb_test, 2),
        "sb_max_wins_test": round(majority_test, 2),
        "train_accuracy": round(accuracy_score(y_train, y_train_predicted) * 100, 2),
        "validation_accuracy": round(accuracy_score(y_validation, y_val_predicted) * 100, 2),
        "test_accuracy": round(accuracy_score(y_test, y_test_predicted) * 100, 2),
        "best_val_params": parameters[np.argmin(val_scores)],
        "best_train_params": parameters[np.argmin(train_scores)]
    })

training on different dataset splits: 100%|██████████| 10/10 [16:55:16<00:00, 6091.65s/it] 


In [23]:
pd.DataFrame(stats)

Unnamed: 0,train_time,vb_train,sb_min_time_train,sb_max_wins_train,validation_time,vb_val,sb_min_time_val,sb_max_wins_val,test_time,vb_test,sb_min_time_test,sb_max_wins_test,train_accuracy,validation_accuracy,test_accuracy,best_val_params,best_train_params
0,1289237.24,87870.97,5884935.9,8065799.59,1534489.4,9948.37,532968.02,883958.0,1179451.12,11133.69,872317.0,1318486.28,72.45,23.21,22.04,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': True, 'criterion': 'gini', 'max_..."
1,1686435.71,88037.02,5945863.14,8350954.45,1711143.97,9948.37,532968.02,883958.0,1551857.26,10967.64,719066.84,1033331.42,71.06,22.99,20.76,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."
2,1656524.17,88302.17,5956449.56,8302512.98,1607786.56,9948.37,532968.02,883958.0,1644413.96,10702.49,800803.34,1081772.89,71.02,23.32,21.35,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': True, 'criterion': 'log_loss', '..."
3,1477086.97,87887.3,6036658.51,8460139.78,1536784.53,9948.37,532968.02,883958.0,1792708.93,11117.36,720594.39,924146.09,72.53,23.43,24.19,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."
4,1465896.24,87948.35,5926275.11,8205605.44,1428338.07,9948.37,532968.02,883958.0,1563398.72,11056.31,738913.26,1178680.43,73.51,23.21,21.35,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."
5,1615021.94,88079.73,6102529.35,8208551.0,1738137.45,9948.37,532968.02,883958.0,1359729.08,10924.93,654723.55,1175734.87,72.77,21.48,22.04,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."
6,1534642.37,88107.62,6034561.82,8534636.19,1446471.97,9948.37,532968.02,883958.0,1579382.85,10897.04,722691.08,849649.68,71.24,21.26,21.45,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."
7,1219820.16,88196.11,6035095.8,8496185.01,1476378.03,9948.37,532968.02,883958.0,1307998.81,10808.55,722157.1,888100.86,72.55,23.43,22.53,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."
8,1724616.79,88665.98,6140539.05,8527943.28,1481868.67,9948.37,532968.02,883958.0,1803897.14,10338.68,616713.85,856342.59,70.67,21.37,22.62,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."
9,4435197.09,88758.17,6213070.61,8495033.36,1339489.7,9214.72,539347.13,811478.74,1260435.78,10980.14,600849.68,961731.77,22.22,9.98,9.4,"{'bootstrap': True, 'criterion': 'gini', 'max_...","{'bootstrap': False, 'criterion': 'entropy', '..."


# Final analysis

In [24]:
for idx, tree_stats in enumerate(stats):
    print(f"""The best random forest for fold {idx} minimising validation time has stats:
    train time / single best min time: {tree_stats["train_time"] / tree_stats["sb_min_time_train"]}
    train time / single best max wins: {tree_stats["train_time"] / tree_stats["sb_max_wins_train"]}
    validation time / single best min time: {tree_stats["validation_time"] / tree_stats["sb_min_time_val"]}
    validation time / single best max wins: {tree_stats["validation_time"] / tree_stats["sb_max_wins_val"]}
    test time / single best min time: {tree_stats["test_time"] / tree_stats["sb_min_time_test"]}
    test time / single best max wins: {tree_stats["test_time"] / tree_stats["sb_max_wins_test"]}
""")
    print("="*100)

The best random forest for fold 0 minimising validation time has stats:
    train time / single best min time: 0.2190741346902351
    train time / single best max wins: 0.1598399793615502
    validation time / single best min time: 2.8791397277457658
    validation time / single best max wins: 1.7359302138789399
    test time / single best min time: 1.3520900314908457
    test time / single best max wins: 0.8945494070670194

The best random forest for fold 1 minimising validation time has stats:
    train time / single best min time: 0.28363177393955286
    train time / single best max wins: 0.20194526507086863
    validation time / single best min time: 3.2105940802977257
    validation time / single best max wins: 1.935775195201582
    test time / single best min time: 2.158154393547059
    test time / single best max wins: 1.501800129139594

The best random forest for fold 2 minimising validation time has stats:
    train time / single best min time: 0.2781059678779518
    train tim