# Prediction Demo

In the following, we predict the benchmark instance labels provided by GBD's meta.db from instance feature records provided by GBD's base.db.


In [3]:
from gbd_core.api import GBD
import pandas as pd

def get_available_features():
    """Open ``data/base.db`` through GBD and return what ``get_features('base')`` reports.

    Returns:
        The value produced by ``gbd.get_features('base')``; the recorded
        notebook output shows it as a list of feature-name strings.
    """
    with GBD(['data/base.db']) as base_db:
        feature_names = base_db.get_features('base')
    return feature_names

def get_prediction_dataset(features, target):
    """Build the data frame used to train a predictor for ``target``.

    Args:
        features: List of feature column names to resolve and convert to numbers.
        target: Name of the label column to resolve alongside the features.

    Returns:
        The data frame returned by the GBD query, with every column named in
        ``features`` passed through ``pd.to_numeric``.
    """
    with GBD(['data/base.db', 'data/meta.db']) as gbd:
        resolved_columns = features + [target]
        # The query string excludes records whose base_features_runtime is 'memout'.
        dataset = gbd.query('base_features_runtime != memout', resolve=resolved_columns)
        numeric_features = dataset[features].apply(pd.to_numeric)
        dataset[features] = numeric_features
        return dataset
    
# Show the feature names that get_available_features() reads from data/base.db.
print(get_available_features())

['base_features_runtime', 'clauses', 'variables', 'cls1', 'cls2', 'cls3', 'cls4', 'cls5', 'cls6', 'cls7', 'cls8', 'cls9', 'cls10p', 'horn', 'invhorn', 'positive', 'negative', 'hornvars_mean', 'hornvars_variance', 'hornvars_min', 'hornvars_max', 'hornvars_entropy', 'invhornvars_mean', 'invhornvars_variance', 'invhornvars_min', 'invhornvars_max', 'invhornvars_entropy', 'balancecls_mean', 'balancecls_variance', 'balancecls_min', 'balancecls_max', 'balancecls_entropy', 'balancevars_mean', 'balancevars_variance', 'balancevars_min', 'balancevars_max', 'balancevars_entropy', 'vcg_vdegree_mean', 'vcg_vdegree_variance', 'vcg_vdegree_min', 'vcg_vdegree_max', 'vcg_vdegree_entropy', 'vcg_cdegree_mean', 'vcg_cdegree_variance', 'vcg_cdegree_min', 'vcg_cdegree_max', 'vcg_cdegree_entropy', 'vg_degree_mean', 'vg_degree_variance', 'vg_degree_min', 'vg_degree_max', 'vg_degree_entropy', 'cg_degree_mean', 'cg_degree_variance', 'cg_degree_min', 'cg_degree_max', 'cg_degree_entropy']


### Category Prediction

Train the instance category (`family`) predictor once and report its accuracy on the held-out test split.

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load every available base feature together with the 'family' label.
feat = get_available_features()
data = get_prediction_dataset(feat, 'family')

# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(data[feat], data['family'], test_size=0.2, random_state=42)
# Seed the forest too: with only the split seeded, the reported accuracy and the
# feature importances read from `model` later would change on every re-run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9690824468085106


### Feature Importance

Extract the feature importances from the category predictor and list the features whose importance exceeds 0.01.

In [10]:
# Rank the features by the importance the fitted category model assigned to them.
feature_importances = model.feature_importances_
sorted_features = sorted(
    zip(feat, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
print("Features sorted by importance:")
for feature_name, importance in sorted_features:
    # Only features whose importance exceeds 0.01 are listed.
    if not importance > 0.01:
        continue
    print(f"{feature_name} {importance:.2f}")


Features sorted by importance:
hornvars_mean 0.05
vcg_cdegree_mean 0.05
hornvars_variance 0.04
balancevars_mean 0.04
balancecls_variance 0.04
balancecls_mean 0.04
vcg_cdegree_variance 0.04
hornvars_entropy 0.03
hornvars_max 0.03
vg_degree_mean 0.03
vcg_cdegree_entropy 0.03
vcg_vdegree_mean 0.03
cg_degree_min 0.03
vcg_vdegree_entropy 0.03
balancevars_entropy 0.03
balancevars_variance 0.02
vg_degree_entropy 0.02
invhornvars_mean 0.02
cls2 0.02
balancecls_entropy 0.02
cls1 0.02
vcg_cdegree_max 0.02
balancecls_max 0.02
negative 0.02
cg_degree_mean 0.02
cg_degree_entropy 0.01
positive 0.01
vg_degree_variance 0.01
vcg_vdegree_variance 0.01
balancevars_max 0.01
cg_degree_variance 0.01
invhornvars_max 0.01
balancevars_min 0.01
invhorn 0.01
invhornvars_variance 0.01
cls3 0.01
cls4 0.01
invhornvars_entropy 0.01
horn 0.01
variables 0.01
vcg_cdegree_min 0.01
cls8 0.01
cls10p 0.01
cg_degree_max 0.01


### Subsets

In [11]:
# Named groups of base-dataset feature columns; each group is evaluated on its own below.
subsets = {
    # Instance size, clause-length counts and clause-type counts.
    'clauseType': [
        'clauses', 'variables',
        'cls1', 'cls2', 'cls3', 'cls4', 'cls5', 'cls6', 'cls7', 'cls8', 'cls9', 'cls10p',
        'horn', 'invhorn', 'positive', 'negative',
    ],
    # hornvars_* and invhornvars_* summary statistics.
    'hornVarDist': [
        'hornvars_mean', 'hornvars_variance', 'hornvars_min', 'hornvars_max', 'hornvars_entropy',
        'invhornvars_mean', 'invhornvars_variance', 'invhornvars_min', 'invhornvars_max', 'invhornvars_entropy',
    ],
    # balancecls_* and balancevars_* summary statistics.
    'posNegBalanceDist': [
        'balancecls_mean', 'balancecls_variance', 'balancecls_min', 'balancecls_max', 'balancecls_entropy',
        'balancevars_mean', 'balancevars_variance', 'balancevars_min', 'balancevars_max', 'balancevars_entropy',
    ],
    # vcg_vdegree_* and vcg_cdegree_* summary statistics.
    'vcgDist': [
        'vcg_vdegree_mean', 'vcg_vdegree_variance', 'vcg_vdegree_min', 'vcg_vdegree_max', 'vcg_vdegree_entropy',
        'vcg_cdegree_mean', 'vcg_cdegree_variance', 'vcg_cdegree_min', 'vcg_cdegree_max', 'vcg_cdegree_entropy',
    ],
    # vg_degree_* and cg_degree_* summary statistics.
    'vgcgDist': [
        'vg_degree_mean', 'vg_degree_variance', 'vg_degree_min', 'vg_degree_max', 'vg_degree_entropy',
        'cg_degree_mean', 'cg_degree_variance', 'cg_degree_min', 'cg_degree_max', 'cg_degree_entropy',
    ],
}

# Train and evaluate one model per feature subset, reusing the existing train/test split.
# The loop variable is named `subset_features` rather than `list`: binding `list` at
# notebook scope would shadow the builtin for every cell executed afterwards.
for name, subset_features in subsets.items():
    X_train_feature = X_train[subset_features]
    X_test_feature = X_test[subset_features]
    # Seeded so the per-subset accuracies and importances are repeatable across runs.
    model_feature = RandomForestClassifier(random_state=42)
    model_feature.fit(X_train_feature, y_train)
    y_pred_feature = model_feature.predict(X_test_feature)
    accuracy_feature = accuracy_score(y_test, y_pred_feature)
    print(f"Accuracy with {name} features:", accuracy_feature)
    # Rank this subset's features by the importance its own model assigned to them.
    feature_importances = model_feature.feature_importances_
    sorted_features = sorted(zip(subset_features, feature_importances), key=lambda x: x[1], reverse=True)
    print("Features sorted by importance:")
    for feature_name, importance in sorted_features:
        print("{} {:.2f}".format(feature_name, importance))

KeyboardInterrupt: 