In [6]:
from decorify import time_limiter
from openml import tasks, runs
from sklearn.metrics import accuracy_score

In [None]:
# Names of the datasets to collect family scores for.
# Positionally paired with `missing_ids` (same length, same order).
missing_sets = [
    "analcatdata_asbestos", "analcatdata_boxing1", "analcatdata_broadwaymult",
    "analcatdata_germangss", "analcatdata_lawsuit", "ar4",
    "autos", "baseball", "bodyfat",
    "braziltourism", "chatfield_4", "chscase_vine1",
    "cloud", "diabetes", "diggle_table_a2",
    "disclosure_z", "elusage", "fri_c0_250_5",
    "kc3", "kidney", "labor",
    "lowbwt", "lupus", "meta",
    "mfeat-karhunen", "mfeat-morphological", "newton_hema",
    "no2", "plasma_retinol", "pm10",
    "prnn_synth", "rabe_131", "rmftsa_sleepdata",
    "schizo", "schlvote", "sleuth_case2002",
    "socmob", "solar-flare", "squash-stored",
    "squash-unstored", "tae", "teachingAssistant",
    "transplant", "triazines", "veteran",
    "visualizing_livestock", "vote", "white-clover",
]

In [None]:
# OpenML task ids, one per entry of `missing_sets` and in the same order.
missing_ids = [
    3550, 3540, 3824, 3887, 3542, 3911, 9, 2077,
    3644, 2078, 3685, 3680, 3753, 37, 3683, 3794,
    3655, 3642, 3915, 3808, 4, 3804, 3562, 3623,
    16, 18, 3649, 3749, 3778, 3616, 3555, 3788,
    3607, 3557, 3713, 3765, 3797, 2068, 3835, 3848,
    47, 3949, 3748, 3653, 3585, 3731, 55, 3872,
]

In [None]:
import json

# Load the family -> [algorithm names] table shipped with the meta-model.
with open("../turbo_ml/meta_learning/meta_model/algorthm_families.json", "r") as families_file:
    algorithm_families = json.load(families_file)

# Reverse lookup: algorithm name -> family. If an algorithm is listed under
# several families, the family that appears last in the file wins.
inv_map = {
    member: family_name
    for family_name, members in algorithm_families.items()
    for member in members
}

# Live view of the family names; displayed as the cell output.
families = algorithm_families.keys()
families

In [None]:
# dataset name -> family -> list of accuracies collected for that family.
# Every inner list is a fresh object, so families do not share storage.
scores = {
    dataset: {fam: [] for fam in families}
    for dataset in missing_sets
}

In [None]:
# Flow names that could not be assigned to any algorithm family.
# Filled while the OpenML runs are scanned; may contain duplicates.
unrececognized_flows = list()

In [None]:
import random
import re

# At most this many runs are inspected per task.
MAX_RUNS_PER_TASK = 100
# Fixed seed so the sampled subset of runs is repeatable for a given run listing.
SAMPLE_SEED = 42

# Patterns applied to the textual description of a run (str(run)).
_METRIC_RE = re.compile(r'Metric.{10}: ([^\s]+)')
_FLOW_NAME_RE = re.compile(r'Flow Name[.]+: ([^\s]+)')
# Accepts "0.97" as well as "1" / "1.0". The previous pattern required a
# leading "0", so runs with perfect accuracy were silently skipped.
_RESULT_RE = re.compile(r'Result.{10}: ([0-9]+(?:\.[0-9]+)?)')

# Fallback used when a flow is not listed in `inv_map`: the first keyword
# contained in the lower-cased flow name decides the family. Order matters.
# NOTE(review): "nn" also matches names that merely contain "knn" - confirm
# that mapping those to the neural-network family is intended.
# NOTE(review): assumes every label below is a key of `algorithm_families`;
# otherwise storing a score raises KeyError - TODO confirm against the JSON.
_FAMILY_KEYWORDS = (
    ("bagging", "Bagging_(BAG)"),
    ("boost", "Boosting_(BST)"),
    ("bayes", "Bayesian_Methods_(BY)"),
    ("nn", "Neural_Networks_(NNET)"),
    ("svm", "Support_Vector_Machines_(SVM)"),
    ("logistic", "Logistic_and_Multinomial_Regression_(LMR)"),
    ("forest", "Random_Forests_(RF)"),
)


def _parse_run_description(desc):
    """Extract (metric, flow_name, result) from a run description string.

    The flow name is cut at the first "(" and the result is converted to
    float. Returns None when any of the three fields cannot be found.
    """
    metric_match = _METRIC_RE.search(desc)
    flow_match = _FLOW_NAME_RE.search(desc)
    result_match = _RESULT_RE.search(desc)
    if metric_match is None or flow_match is None or result_match is None:
        return None
    flow_name = flow_match.group(1).split('(')[0]
    return metric_match.group(1), flow_name, float(result_match.group(1))


def _normalize_flow_name(flow_name):
    """Rewrite "weka.<X>" as "<X>_w"; leave every other name untouched."""
    if flow_name.startswith('weka.'):
        return f"{flow_name[5:]}_w"
    return flow_name


def _resolve_family(flow_name):
    """Return the family for `flow_name`, or None if it cannot be determined.

    An exact hit in `inv_map` takes precedence; otherwise the keyword
    fallbacks are tried in order against the lower-cased name.
    """
    if flow_name in inv_map:
        return inv_map[flow_name]
    lowered = flow_name.lower()
    for keyword, family in _FAMILY_KEYWORDS:
        if keyword in lowered:
            return family
    return None


# The two lists are paired by position; a length mismatch would silently
# truncate the zip below.
assert len(missing_ids) == len(missing_sets), "task ids and dataset names must pair up"

rng = random.Random(SAMPLE_SEED)
for task_id, name in zip(missing_ids, missing_sets):
    print(name)
    runs_df = runs.list_runs(task=[task_id], output_format='dataframe')
    run_ids = runs_df['run_id'].tolist()
    rng.shuffle(run_ids)

    # Collected locally and written to `scores` in one step at the end, so
    # this cell can be re-run without first re-creating `scores`.
    family_scores = {family: [] for family in families}
    for run_id in run_ids[:MAX_RUNS_PER_TASK]:
        parsed = _parse_run_description(str(runs.get_run(run_id)))
        if parsed is None:
            continue
        metric, flow_name, result = parsed
        if metric != 'predictive_accuracy':
            continue

        flow_name = _normalize_flow_name(flow_name)
        family = _resolve_family(flow_name)
        if family is None:
            unrececognized_flows.append(flow_name)
            continue
        family_scores[family].append(result)

    # Keep the best accuracy per family; 0 marks a family with no usable run.
    scores[name] = {
        family: max(values) if values else 0
        for family, values in family_scores.items()
    }

In [None]:
# Show the nested dict: dataset name -> family -> best accuracy (0 when no run was found).
scores

In [None]:
import pandas as pd

# from_dict puts datasets in columns and families in rows; transpose so each
# dataset becomes a row, then label the index "name" before exporting.
df = (
    pd.DataFrame.from_dict(scores)
    .T
    .reset_index()
    .rename(columns={'index': 'name'})
    .set_index('name')
)
df.to_csv("../data/missing_family_scores.csv")

In [None]:
# Family names, in the order they appear in the loaded JSON.
list(algorithm_families)

In [None]:
import getpass
import os

import google.generativeai as genai

# The key is read from the GOOGLE_API_KEY environment variable; when it is not
# set, the user is prompted without echo. Never paste a real key into the
# notebook - it would be saved (and committed) together with the file.
genai.configure(
    api_key=os.environ.get("GOOGLE_API_KEY")
    or getpass.getpass("Google Generative AI API key: ")
)

# System instruction sent with every request; the user message is just the
# algorithm (flow) name.
prompt = "What is the algorithm family of the algorithm?"

# Constrain the answer to exactly one of these labels.
response_schema = {
    "type": "STRING",
    "enum": [
        'Discriminant Analysis',
        'Bayesian Methods',
        'Neural Networks',
        'SVM',
        'Decision Trees',
        'Rule-Based Methods',
        'Boosting',
        'Bagging',
        'Stacking',
        'Random Forests',
        'Other Ensembles',
        'Generalized Linear Models',
        'Nearest Neighbor Methods',
        'Partial Least Squares and Principal Component Regression',
        'Logistic and Multinomial Regression',
        'Multivariate Adaptive Regression Splines',
        'Other Methods',
    ],
}

# No model name is passed here, so whichever default the library applies is used.
model = genai.GenerativeModel(
    system_instruction=prompt,
    generation_config=genai.GenerationConfig(
        response_mime_type="text/x.enum", response_schema=response_schema
    ),
)

@time_limiter(60, 14)  # 15 requests per minute (I left one out as buffer)
def classify_algorithm(name):
    """Ask the configured generative model which family `name` belongs to.

    Prints the total token count reported for the request and returns the
    text of the model's reply.
    """
    reply = model.generate_content(name)
    total_tokens = reply.usage_metadata.total_token_count
    print(total_tokens)
    return reply.text

# Classify every distinct unrecognized flow once. The dict is filled
# incrementally so that already-classified names remain available if a
# later request fails.
mapping = {}
unique_flows = set(unrececognized_flows)
for flow in unique_flows:
    mapping[flow] = classify_algorithm(flow)

In [None]:
# Persist the flow name -> family mapping next to the notebook.
with open("classification.json", mode="w") as out_file:
    json.dump(mapping, out_file)