In [60]:
#All imports
import pandas as pd
import numpy as np
import json
import re
import nltk
import wordninja
from typing import List, Tuple, Dict
from sklearn.metrics import classification_report
from nltk.stem.wordnet import WordNetLemmatizer
import random
from openai import OpenAI
import time
import copy
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
import pFAHES.common as common
import pFAHES.patterns as patterns
import pFAHES.DV_Detector as DV_Detector
import pFAHES.RandDMVD as RandDMVD
import pFAHES.OD as OD
from statistics import mean, median, mode
import copy
from sortinghatinf import get_sortinghat_types
from numpy import percentile
import tiktoken
from deepchecks.tabular.checks import MixedDataTypes
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import StringMismatch
from PyNomaly import loop
from nameguess.metric import BNGMetrics 
import pickle
from evaluate import load
import spacy
import string

## Classes and helper functions

In [2]:
class CrypticNameGenerator:
    def __init__(self, per_tok_target_len, lookup_abbreviation,
                 p_filter_acronym, lookup_acronym,
                 pr_keep_k, pr_remove_vowels, pr_logic,
                 pm_as_is, pm_lookup, pm_selected_rule):
        """_summary_
        Class for automatic cryptic name generation from table column headers

        Args:
            per_tok_target_len (int): the target length when abbreviating each token through rules
            lookup_abbreviation (dict): a lookup tables containing (expansion, abbreviation) pairs
            lookup_acronym (dict): a lookup tables containing (expansion, acronym) pairs
            p_filter_acronym (float): probability of filtering and replacing the subsequence by an acronym from the acronym lookup dictionary.
            pr_keep_k (float): for rules, the probability of choosing rule 1: keep the first k characters
            pr_remove_vowels (float): for rules, the probability of choosing rule 2: remove all non-leading vowels
            pr_logic (float): for rules, the probability of choosing rule 3: logic from https://docs.tibco.com/pub/enterprise-runtime-for-R/4.1.1/doc/html/Language_Reference/base/abbreviate.html
            pm_as_is (float): for token-level methods, the probability of choosing token-level method 1: keep the token as-is
            pm_lookup (float): for token-level methods, the probability of choosing token-level method 2: generate abbreviation through lookup table
            pm_selected_rule (float): or token-level methods, the probability of choosing token-level method 3: use rules selected from (pr_keep_k, pr_remove_vowels, pr_logic)
        """
        self.per_tok_target_len = per_tok_target_len
        self.lookup_abbreviation = lookup_abbreviation
        self.lookup_acronym = lookup_acronym
        self.p_filter_acronym = p_filter_acronym
        self.pr_keep_k = pr_keep_k
        self.pr_remove_vowels = pr_remove_vowels
        self.pr_logic = pr_logic
        self.pm_as_is = pm_as_is
        self.pm_lookup = pm_lookup
        self.pm_selected_rule = pm_selected_rule
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.stemmer = nltk.stem.PorterStemmer()
        
    def rule_keep_k(self, query: str) -> str:
        """
        Rule 1: Keep first k characters in a word
        """
        return query[:self.per_tok_target_len] if len(query) > self.per_tok_target_len else query

    def rule_remove_vowels(self, query: str) -> str:
        """
        Rule 2: Keep removing non-leading vowels until the threshold or all non-leading vowels have been removed
        """
        start, elems = query[0], list(query)[1:]
        
        vow_idx = [i for i, val in enumerate(elems) if val in ('a', 'e', 'i', 'o', 'u')]
        counter_vow = len(vow_idx)
        counter_truncate = len(query)
        if len(query) > self.per_tok_target_len and vow_idx: 
            while counter_truncate >= self.per_tok_target_len and counter_vow > 0:
                elems[vow_idx[counter_vow-1]] = ""
                counter_vow -= 1
                counter_truncate -= 1

        return start + "".join(elems)

    def rule_logic(self, query: str) -> str:
        """ 
        Rule 3:
        Code contributed by Nicholas Hespe @nahespe

        The abbreviation algorithm does not simply truncate. 
        It has a threshold, according to which it will drop, in order:

            1. duplicate values next to eachother
            2. lower case vowels.
            3. lower case consonants and punctuation.
            4. upper case letters and special characters.  
        
        exits if target_len <= 2
        
        """
        start, elems = query[0], list(query)[1:]
        
        ## exit early if not valid
        if len(elems) < self.per_tok_target_len: 
            return start + "".join(elems)
        
        counter = len(elems)
        while counter >= self.per_tok_target_len:
            counter -= 1
            
            ## remove duplicates next to eachother
            candidates = [i for i in range(len(elems[:-1])) if (elems[i] and elems[i]==elems[i+1])]
            if candidates:
                choice = random.choice(candidates)
                elems[choice] = ""
                continue
                
            ## search for vowels and remove right to left
            candidates = [i for i, val in enumerate(elems) if val in ('a', 'e', 'i', 'o', 'u')]
            if candidates:
                choice = random.choice(candidates)
                elems[choice] = ""
                continue
            
            ## Search for  lower case consonants and remove randomly
            candidates = [i for i, val in enumerate(elems) if (val and not val in ('a', 'e', 'i', 'o', 'u'))]
            if candidates:
                choice = random.choice(candidates)
                elems[choice] = ""
            
        return start + "".join(elems)

    def select_from_probs(self, probs: list, epsilon: float=1e-8) -> int:
        """
        Make random selection based on the probabilities of each index
        """
        assert abs(np.sum(probs) - 1) < epsilon, 'Sampling probabilities must add up tp 1.'
       
        rand = random.uniform(0, 1)

        def cum_sum(l):
            sum = 0
            new_l = [0]
            for ele in l:
                sum += ele
                new_l.append(sum) 
            return new_l

        probs_cum = cum_sum(probs)
        for i, this_level in enumerate(probs_cum[:-1]):
            next_level = probs_cum[i + 1]
            if this_level <= rand < next_level:
                return i
            else:
                pass

    def tokenize(self, text: str, 
                   keep_punc: bool=True, 
                   keep_stopwords: bool=True,
                   split_camelcase: bool=True,
                   use_stem: bool=False) -> list:
        """_summary_
        Split the text into words and punctuations

        Args:
            text (str): input string
            keep_punc (bool, optional): whether to keep non-alphanumeric symbols. Defaults to True.
            keep_stopwords (bool, optional): whether to keep stop words. Defaults to False.
            split_camelcase (bool, optional): whether to split camelCased words (i.e. "camelCase" -> "camel Case"). Defaults to True.
            use_stem (bool, optional): whether to use stemmer
        Returns:
            list: a list of tokens
        """
        def split_with_punc(text: str) -> list:
            return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)

        def separate_camel_case(text: str) -> list:
            return re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', text))

        text = text.replace('_', ' ')
        if split_camelcase:
            text = separate_camel_case(text)
        if keep_punc:
            res = split_with_punc(text)
        else:
            res = text.split()
        if not keep_stopwords:
            res = [ele for ele in res if ele not in self.stopwords]
        
        ## Each tokenized words are stemmed
        return [self.stemmer.stem(ele) if use_stem else ele for ele in res]

    ## Methods
    def select_rule(self, query: str) -> str:
        """
        Method 3: Randomly select a rule from all the pre-defined rules and apply on the string
        """
        ## Rule not applied on numericals
        if query.isdigit():
            return query 

        rule_choices = [(self.pr_keep_k, self.rule_keep_k), 
                        (self.pr_remove_vowels, self.rule_remove_vowels), 
                        (self.pr_logic, self.rule_logic)]
        ## Probabilities of choosing each of the rule when the method seleted is rule-based.
        rule_probs = [choice[0] for choice in rule_choices]
        selected_rule_idx = self.select_from_probs(rule_probs)
        selected_rule = rule_choices[selected_rule_idx][-1]
        if len(query) > 10:
            orig_thres = self.per_tok_target_len
            self.per_tok_target_len = len(query) // 2
            res = selected_rule(query)
            self.per_tok_target_len = orig_thres
        else:
            res = selected_rule(query)
        return res

    def as_is(self, query: str) -> str:
        """
        Method 1: Keep the word as is.s
        """
        return query
    
    def lookup(self, query: str) -> str: 
        """
        Method 2: Find corresponding abbreviation from a lookup table
        """
        ## TODO if returns multiple values, current solution is to randomly pick one, but need to later figure out a soln to cache one value for future use in the same table, or some similar tables
        if query in self.lookup_abbreviation:
            values_raw = self.lookup_abbreviation[query]
            if values_raw is not None:
                weights = [ele["upvotes"] for ele in values_raw.values()]
                if sum(weights) > 0:
                    abbrev = random.choices(list(values_raw.keys()),
                        weights=weights, k=1)[0]
                    return abbrev
                
        return self.select_rule(query)

    def select_method(self, query: str) -> str:
        """
        Select one of the token-level processing method
        """
        method_choices = [(self.pm_as_is, self.as_is), 
                          (self.pm_lookup, self.lookup), 
                          (self.pm_selected_rule, self.select_rule)]
        method_probs = [choice[0] for choice in method_choices]
        selected_method_idx = self.select_from_probs(method_probs)
        selected_method = method_choices[selected_method_idx][-1]
        return selected_method(query)

    def combine(self, toks: list, p_camel=.333, p_underscore=.333) -> str:
        """
        Combine the abbreviated tokens into the cryptic name by either camelCase or underscore_name
        """
        def preprocess(toks: list) -> list:
            new_toks = []
            for tok in toks:
                if isinstance(tok, list):
                    new_toks.extend(tok)
                else:
                    new_toks.append(tok)
            return new_toks

        def combine_underscore(toks: list) -> str:
            res = ""
            for i, tok in enumerate(toks):
                if tok.isalnum() and i < len(toks) - 1:
                    res += tok
                    if toks[i+1].isalnum():
                        res += "_"
                else:
                    res += tok
            return res
        
        def combine_camel(toks: list) -> str:
            if len(toks) > 1:
                camel_case = "".join([toks[0]] + [tok[0].upper() + tok[1:] if len(tok) > 1 else tok.upper() for tok in toks[1:]])
                return camel_case
            else:
                return "".join(toks)

        def combine_simple(toks: list) -> str: 
            return "".join(toks)

        toks = preprocess(toks)
        rand = random.uniform(0, 1)
        if 0 < rand < p_camel:
            return combine_camel(toks)
        elif p_camel <= rand < p_camel + p_underscore:
            return combine_underscore(toks)
        else:
            return combine_simple(toks)
    
    def span2plus(self, lst):
        res = []
        for i in range(2, len(lst) + 1):
            for t in range(len(lst) - i + 1):
                res.append((lst[t:t+i], t, t+i))
        return res

    def filter_acronyms(self, words, lookup):
    
        combs = self.span2plus(words)
        for comb, l_end, r_start in combs:
            comb_string = " ".join(comb)
            if comb_string in lookup:
                acronym_cands = lookup[comb_string]
                weights = [ele["upvotes"] for ele in acronym_cands.values()]
                if sum(weights) > 0:
                    acronym = random.choices(list(acronym_cands.keys()), weights=weights, k=1)[0]
                    left, right = words[:l_end], words[r_start:]
                    return [acronym], l_end, r_start
        return [], -1, -1

    def generate(self, text: str) -> str:
        """
        Generate cryptic name from column header
        """
        toks = self.tokenize(text)
        if len(toks) < 10:
            # The time complexity for acronym matching is O(N(N-1)/2) ~ O(N^2), where N is the number of tokens in the column header.
            # It is possible to encounter very long headers like a small paragraph and we should avoid matching acronyms for very long headers.
            # Threshold set to 10 tokens.
            acronym, acronym_start_idx, acronym_end_idx = self.filter_acronyms([tok.lower() for tok in toks], self.lookup_acronym)
            rand = random.uniform(0, 1)

            ## Case where there exist matching span(s) from the acronym lookup dictionary and generator selected to replace acronyms
            if acronym_start_idx >= 0 and rand < self.p_filter_acronym:
                left = [self.select_method(tok) for tok in toks[:acronym_start_idx]]
                right = [self.select_method(tok) for tok in toks[acronym_end_idx:]]
                return left + acronym + right

        return [self.select_method(tok) for tok in toks]

In [5]:
class CrypticIdentifier:
    """Module to identify any cryptic forms in a column header.
    Example usage: 
        identifier = CrypticIdentifier(vocab_file)
        identifier.iscryptic("newyorkcitytotalpopulation") --> False
        identifier.iscryptic("tot_revq4") --> True
    """
    def __init__(self, vocab_file=None, word_rank_file=None, k_whole=4, k_split=2):
        """
        Args:
            vocab_file (str, optional): json file containing the vocabulary. Defaults to None.
            k_whole (int, optional): length threshold for a whole string to be considered non-cryptic if it fails the first round of check (i.e.
            _iscryptic returns True). Defaults to 4.
            k_split (int, optional): length threshold for each word split (wordninja.split()) from the string to be considered non-cryptic, if the pre-split string fails the first round of check (i.e.
            _iscryptic returns True). Defaults to 2.
        """
        if vocab_file is not None:
            with open(vocab_file, "r") as fi:
                self.vocab = json.load(fi)
#                 print("#vocab={}".format(len(self.vocab)))
        else:
            self.vocab = None

        self.k_whole = k_whole
        self.k_split = k_split
        if word_rank_file is None:
            self.splitter = wordninja
        else:
            self.splitter = wordninja.LanguageModel(word_rank_file)
        self.lem = WordNetLemmatizer()
        

    def split_rm_punc(self, text: str) -> list:
        return re.sub(r'[^\w\s]', ' ', text).split()

    def separate_camel_case(self, text: str) -> list:
        return re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', text))

    def convert2base(self, text: str) -> str:
        return self.lem.lemmatize(text)

    def _split(self, text: str) -> list:
        text = text.replace('_', ' ')
        words = self.split_rm_punc(self.separate_camel_case(text))
        return words

    def _iscryptic(self, text: str) -> bool:
        words = self._split(text)
        if all([word.isnumeric() for word in words]):
            return True
        if self.vocab is None:
            self.vocab = nltk.corpus.wordnet.words('english')
        return any([self.convert2base(w.lower()) not in self.vocab for w in words])



    def doublecheck_cryptic(self, text: str) -> Tuple[bool, List[str]]:
        """Double-check whether a column header contains cryptic terms. For example in some cases where neither 
        delimiters between tokens nor camelcases is available

        Args:
            text (str): column header

        Returns:
            Tuple[
                    bool: whether header is cryptic
                    List[str]: splitted tokens from the header
                ]
        """

        #stopwords = nltk.corpus.stopwords.words('english')

        def split_check(words: List[str]) -> Tuple[bool, List[str]]:
            l_cryptic = []
            for ele in words:
                if ele.isdigit():
                    l_cryptic.append(False)
                ## Cornercases includes stopwords like "I", "for", etc.
                elif len(ele) < self.k_split: # and ele.lower() not in stopwords:
                    l_cryptic.append(True)
                ## Second round check
                else:
                    l_cryptic.append(self._iscryptic(ele))
            return any(l_cryptic), words
            
        if len(text) >= self.k_whole:
            if self._iscryptic(text):
                split = self.splitter.split(text)
                return split_check(split)            
            else:
                # return (False, self.splitter.split(text))
                return (False, self._split(text))
        else:
            return (True, [text])

    def iscryptic(self, text: str) -> bool:
        return self.doublecheck_cryptic(text)[0]
    
    def split_results(self, text: str) -> List[str]:
        return self.doublecheck_cryptic(text)[1]

In [16]:
def query_cryp_cols(cryp_list, title, description, content_df):
    ''' Function to generate the query for a dataset based on the parameters passed'''
    col_query = " | ".join(str(item) for item in cryp_list)
    
    if not content_df.empty:
        num_instances = len(content_df)
        content_list = [content_df.loc[idx].to_list() for idx in content_df.index]
        contents = "\n".join([" | ".join(str(item) for item in lst) for lst in content_list])
        content_bool = True
    else:
        content_bool = False
    
    if title == False:
        title_query = ""
    else:
        title_query = f"with title: {title} "
    if description == False:
        desc_query = ""
    else:
        desc_query = f"with description: {description} "
    if content_bool == False:
        content_query = ""
    else:
        content_query = f"with contents of {num_instances} random instances:\n{contents}"
    
    if title == False and description == False and content_bool == False:
        subquery = ""
    else:
        subquery=f"""
{title_query}
{desc_query}
{content_query}
        """
    
    query = f"""Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset{subquery},
the abbreviated column names: {col_query} stand for"""
    print(query)
    return query

## Correct

In [17]:
def correct_cryp_cols(df, title=False, description=False, n_instances=0):
    ''' Correction function that calls GPT3.5 to suggest better names for column names (as used in the tool)'''
    # Set API key, uncomment to test
    client = OpenAI(api_key="") # place your api key here
    
    # Identify the cryptic column names
    cryptic_cols = detect_cryptic(df)
    num_cols = len(cryptic_cols)
    token_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    if n_instances == 0:
        instances = pd.DataFrame()
    else:
        # Obtain n random instances of content for the cryptic columns
        sliced_df = df[cryptic_cols]
        indices = random.sample(range(0, len(sliced_df)), n_instances)
        instances = sliced_df.iloc[indices]
    
    # Generate the OpenAI query
    query = query_cryp_cols(cryptic_cols, title=title, description=description, content_df=instances)
    tokens_used = len(token_encoding.encode(query))
    print(tokens_used)

    # Call OpenAI's GPT 3.5 Turbo LLM model to generate better column names
    completion = client.chat.completions.create(model="gpt-3.5-turbo",temperature=0.0,messages=[{"role": "user", "content": query}])
    message = completion.choices[0].message
    new_col_names = message.content
    return new_col_names, tokens_used

## Generate Cryptic Names

In [8]:
def generate_cryptic(non_cryptic_list):
    ''' Function to generate cryptic names for non-cryptic column names.'''
    cryptic_list = []
    not_same = True
    with open("./lookups/cryptifier_config.json", "r") as fi:
        params = json.load(fi)
    generator = CrypticNameGenerator(lookup_abbreviation="./lookups/abbreviation_samples.json", lookup_acronym="./lookups/acronym_samples.json", **params)
    for non_cryptic_name in non_cryptic_list:
        while not_same:
            cryptic_name = generator.combine(generator.generate(non_cryptic_name))
            if cryptic_name != non_cryptic_name:
                cryptic_list.append(cryptic_name)
                not_same = False
        not_same = True
    return cryptic_list

## Detect Cryptic Names

In [9]:
def detect_cryptic(df):
    ''' Function to detect cryptic attributes in a dataset.'''
    identifier = CrypticIdentifier("./lookups/wordnet.json", "./lookups/wordninja_words_alpha.txt.gz")
    cryptic_cols = [col for col in df.columns if identifier.doublecheck_cryptic(col)[0]==True or len(col) < 5]
    return cryptic_cols

## Detect Non-Cryptic Names

In [10]:
def detect_non_cryptic(df):
    ''' Function to detect non-cryptic attributes in a dataset'''
    cryptic = detect_cryptic(df)
    non_cryptic = list(set(df.columns) - set(cryptic))
    return non_cryptic

## Generate Non-Cryptic Names (GPT-3.5)

In [11]:
def replace_cols_desc(description, correct_columns, cryptic_columns):
    ''' For the queries containing the description, the ground truth column names 
    in the description have to be replaced by the cryptic names'''
    for correct, cryptic in zip(correct_columns, cryptic_columns):
        description = description.replace(correct, cryptic)
    return description

In [14]:
def generate_non_cryptic(df, non_cryp_cols, gen_cryp_cols, n_instances, title=False, description=False):
    ''' Generate non-cryptic names for the generated cryptic names using GPT-3.5'''
        # Set API key, uncomment to test
    client = OpenAI(api_key="") # place your API key here
    token_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    
    if n_instances == 0:
        instances = pd.DataFrame()
    else:
        # Obtain n random instances of content for the cryptic columns
        sliced_df = df[non_cryp_cols]
        indices = random.sample(range(0, len(sliced_df)), n_instances)
        instances = sliced_df.iloc[indices]
    
    if description:
        description = replace_cols_desc(description, non_cryp_cols, gen_cryp_cols)
    
    # Generate the OpenAI query
    query = query_cryp_cols(gen_cryp_cols, title=title, description=description, content_df=instances)
    tokens_used = len(token_encoding.encode(query))

    # Call OpenAI's GPT 3.5 Turbo LLM model to generate better column names
    completion = client.chat.completions.create(model="gpt-3.5-turbo",temperature=0.0,messages=[{"role": "user", "content": query}])
    message = completion.choices[0].message
    new_col_names_string = message.content
    new_col_names = new_col_names_string.split(' | ')
    return new_col_names, tokens_used

## Obtain the 20 Test Datasets

In [21]:
import openml
# List all datasets and their properties
df_datasets = openml.datasets.list_datasets(output_format="dataframe")

In [22]:
# Obtain 10 datasets with 10 to 20 features and 10 datasets with 20 to 30 features.
# Less than 10.000 instances and version 1 are used to filter down all the options
df_10_to_20 = df_datasets[(df_datasets['NumberOfFeatures'] >= 10) & (df_datasets['NumberOfFeatures'] <= 20) & (df_datasets['NumberOfInstances'] <= 10000) & (df_datasets['NumberOfInstances'] >= 10) & (df_datasets['version'] == 1)].dropna()
df_20_to_30 = df_datasets[(df_datasets['NumberOfFeatures'] > 20) & (df_datasets['NumberOfFeatures'] <= 30) & (df_datasets['NumberOfInstances'] <= 10000) & (df_datasets['NumberOfInstances'] >= 10) & (df_datasets['version'] == 1)].dropna()

In [24]:
df_20_to_30.head()

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
9,9,autos,1,1,active,ARFF,67.0,22.0,3.0,6.0,26.0,205.0,46.0,59.0,15.0,11.0
24,24,mushroom,1,1,active,ARFF,4208.0,12.0,3916.0,2.0,23.0,8124.0,2480.0,2480.0,0.0,23.0
25,25,colic,1,1,active,ARFF,232.0,63.0,136.0,2.0,27.0,368.0,361.0,1927.0,7.0,20.0
31,31,credit-g,1,1,active,ARFF,700.0,10.0,300.0,2.0,21.0,1000.0,0.0,0.0,7.0,14.0
38,38,sick,1,1,active,ARFF,3541.0,5.0,231.0,2.0,30.0,3772.0,3772.0,6064.0,7.0,23.0


In [26]:
# Random seed to reproduce the indices
# We select 15 datasets from the 10 to 20 features dataset and 5 from the 20 to 30 features dataset
random.seed(41)
idx_10_20 = random.sample(range(0, 64), 15)
idx_20_30 = random.sample(range(0, len(df_20_to_30)), 5)
idx_10_20, idx_20_30

([48, 21, 14, 10, 57, 58, 24, 36, 44, 18, 35, 17, 59, 46, 63],
 [36, 0, 15, 42, 1])

In [27]:
instances_10_20 = df_10_to_20.iloc[idx_10_20]
instances_20_30 = df_20_to_30.iloc[idx_20_30]

In [28]:
instances_10_20

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
1552,1552,autoUniv-au7-1100,1,64,active,ARFF,305.0,5.0,153.0,5.0,13.0,1100.0,0.0,0.0,8.0,5.0
375,375,JapaneseVowels,1,2,active,ARFF,1614.0,9.0,782.0,9.0,15.0,9961.0,0.0,0.0,14.0,1.0
55,55,hepatitis,1,1,active,ARFF,123.0,2.0,32.0,2.0,20.0,155.0,75.0,167.0,6.0,14.0
50,50,tic-tac-toe,1,1,active,ARFF,626.0,3.0,332.0,2.0,10.0,958.0,0.0,0.0,0.0,10.0
40690,40690,threeOf9,1,869,active,ARFF,274.0,2.0,238.0,2.0,10.0,512.0,0.0,0.0,0.0,10.0
40691,40691,wine-quality-red,1,869,active,ARFF,681.0,6.0,10.0,6.0,12.0,1599.0,0.0,0.0,11.0,1.0
465,465,analcatdata_cyyoung8092,1,2,active,ARFF,73.0,62.0,24.0,2.0,11.0,97.0,0.0,0.0,7.0,4.0
1100,1100,PopularKids,1,2,active,ARFF,247.0,9.0,90.0,3.0,11.0,478.0,0.0,0.0,6.0,5.0
1498,1498,sa-heart,1,64,active,ARFF,302.0,2.0,160.0,2.0,10.0,462.0,0.0,0.0,8.0,2.0
185,185,baseball,1,1,active,ARFF,1215.0,7.0,57.0,3.0,17.0,1340.0,20.0,20.0,15.0,2.0


In [29]:
instances_20_30

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
40649,40649,GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...,1,869,active,ARFF,800.0,3.0,800.0,2.0,21.0,1600.0,0.0,0.0,0.0,21.0
9,9,autos,1,1,active,ARFF,67.0,22.0,3.0,6.0,26.0,205.0,46.0,59.0,15.0,11.0
1062,1062,ar5,1,2,active,ARFF,28.0,2.0,8.0,2.0,30.0,36.0,0.0,0.0,29.0,1.0
40708,40708,allrep,1,869,active,ARFF,3648.0,5.0,34.0,4.0,30.0,3772.0,0.0,0.0,6.0,24.0
24,24,mushroom,1,1,active,ARFF,4208.0,12.0,3916.0,2.0,23.0,8124.0,2480.0,2480.0,0.0,23.0


In [71]:
list_dids = instances_20_30['did'].to_list() + instances_10_20['did'].to_list()
list_dids

[40649,
 9,
 1062,
 40708,
 24,
 1552,
 375,
 55,
 50,
 40690,
 40691,
 465,
 1100,
 1498,
 185,
 1057,
 171,
 40693,
 1512,
 41760]

In [72]:
# Only want datasets with at least 5 non-cryptic attributes
dataset_list = []
did_good = []
for did in list_dids:
    dataset = openml.datasets.get_dataset(did)
    df, y, _, _ = dataset.get_data(dataset_format="dataframe")
    if len(detect_non_cryptic(df)) < 6:
        continue
    else:
        did_good.append(did)
did_good


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta

[9, 1062, 40708, 24, 375, 55, 50, 40691, 1100, 185, 171, 40693, 41760]

In [73]:
# Additionally, it turned out that dataset ids 1062 and 375 gave errors, so we chose two other for them as well
did_good.remove(375)
did_good.remove(1062)
did_good

[9, 40708, 24, 55, 50, 40691, 1100, 185, 171, 40693, 41760]

## Datasets with not enough non-cryptic columns
10 to 20:374, 1552,40690,465,1498,1057,1512 = 7     
20 to 30:1062, 40649 = 2

By hand we selected those 9 datasets, to get the following list of dataset ids

In [53]:
list_dids = [9,1059,40708,24,36,55,50,40691,1100,185,171,40693,41760,40701,54,4,10,23,13,187]

# Experiment 1: example instances

## Now that we have sampled 20 datasets with at least 5 non cryptic attributes, we can start the experiment 1

In [38]:
# This code generates all predictions for the first experiment with the example instances
num = 1
pred_dct = dict()
token_length_0 = []
token_length_1 = []
token_length_3 = []
token_length_5 = []
token_length_10 = []

for did in list_dids:
    dataset = openml.datasets.get_dataset(did)
    df, _, _, _ = dataset.get_data(dataset_format="dataframe")
    # Find the non-cryptic columns and convert them to cryptic columns (so we have X and y)
    y = detect_non_cryptic(df)
    X = generate_cryptic(y)
    
    # Give the cryptic names as input to GPT combined with an n number of instances and ask
    # it to generate non-cryptic names
    gpt_predictions_0, t_0 = generate_non_cryptic(df, y, X, 0)
    gpt_predictions_1, t_1 = generate_non_cryptic(df, y, X, 1)
    gpt_predictions_3, t_3 = generate_non_cryptic(df, y, X, 3)
    gpt_predictions_5, t_5 = generate_non_cryptic(df, y, X, 5)
    gpt_predictions_10, t_10 = generate_non_cryptic(df, y, X, 10)
    
    if (len(y) != len(gpt_predictions_0)) or (len(y) != len(gpt_predictions_1)) or (len(y) != len(gpt_predictions_3)) or (len(y) != len(gpt_predictions_5)) or (len(y) != len(gpt_predictions_10)):
        print(f"Something went wrong with the predictions in dataset number {num}")
        print("The number of predictions do not align with the number of ground truth values")
        print(f"Length y: {len(y)}, y_pred_0: {len(gpt_predictions_0)}, y_pred_1: {len(gpt_predictions_1)}, y_pred_3: {len(gpt_predictions_3)}, y_pred_5: {len(gpt_predictions_5)}, y_pred_10: {len(gpt_predictions_10)}")
        print("We will not add this dataset to the predictions")
        continue
    else:
        # Store the predictions in a dataframe
        df_predictions = pd.DataFrame()
        df_predictions['y'] = y
        df_predictions['X'] = X
        df_predictions[f'y_pred_0'] = gpt_predictions_0
        df_predictions[f'y_pred_1'] = gpt_predictions_1
        df_predictions[f'y_pred_3'] = gpt_predictions_3
        df_predictions[f'y_pred_5'] = gpt_predictions_5
        df_predictions[f'y_pred_10'] = gpt_predictions_10

        # To compute the bertscore F1 in the end, we will divide by the average token length for every query
        token_length_0.append(t_0)
        token_length_1.append(t_1)
        token_length_3.append(t_3)
        token_length_5.append(t_5)
        token_length_10.append(t_10)

        # Store the predictions with the token lengths in a dictionary, which we can use to compute the scores
        token_lengths = {"token_length" : [t_0, t_1, t_3, t_5, t_10]}
        pred_dct[f"pred_df_{num}"] = [df_predictions, token_lengths]
    num += 1
pred_dct


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: ELONGA | SCLDVARIANCEMJOR | SCATTERRATIO | DISTANCECIRCULARITY | SCALRAISOFGYRATION | SCALEDVAICMINO | RADIUSRATIO | SEWEABOUTMIOR | CMACT | Clss | HOLLOWSRATIO | MAX.LETHRECTANG | CRRTY | MAX.LENGTHAPCTRATO | SKEWABOUMJOR stand for


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [187]:
# Save the predictions to a pickle file
with open('data/complete_preds_experiment_1.pkl', 'wb') as fp:
    pickle.dump(pred_dct, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


In [85]:
# Load the predictions in to calculate the performances
with open("data/complete_preds_experiment_1.pkl", 'rb') as fp:
    pred_dct_inst = pickle.load(fp)
pred_dct_inst

{'pred_df_1': [                    y            X           y_pred_0           y_pred_1  \
  0              length         lgth             Length             Length   
  1          wheel-base    whel-Base         Wheel-Base         Wheel-Base   
  2               width         wdth              Width              Width   
  3         engine-type    engn-type        Engine Type        Engine Type   
  4               price         pric              Price              Price   
  5          aspiration       asprtn         Aspiration         Aspiration   
  6         curb-weight    curb-wght        Curb Weight        Curb Weight   
  7        drive-wheels    drve-Whls       Drive Wheels       Drive Wheels   
  8         fuel-system   fuel-systm        Fuel System        Fuel System   
  9     engine-location    engi-loca    Engine Location    Engine Location   
  10         body-style    body-styl         Body Style         Body Style   
  11           peak-rpm     peak-Rpm           Peak

In [57]:
# We calculate the average token lengths for all query types
token_len_0 = []
token_len_1 = []
token_len_3 = []
token_len_5 = []
token_len_10 = []

for df,tokens in pred_dct_inst.values():
    token_len_0.append(tokens['token_length'][0])
    token_len_1.append(tokens['token_length'][1])
    token_len_3.append(tokens['token_length'][2])
    token_len_5.append(tokens['token_length'][3])
    token_len_10.append(tokens['token_length'][4])
avg_tokens_0 = mean(token_len_0)
avg_tokens_1 = mean(token_len_1)
avg_tokens_3 = mean(token_len_3)
avg_tokens_5 = mean(token_len_5)
avg_tokens_10 = mean(token_len_10)
avg_token = [avg_tokens_0, avg_tokens_1, avg_tokens_3, avg_tokens_5, avg_tokens_10]

In [59]:
# Load in the model to calculate the bertscore F1 scores
bertscore = load("bertscore")

In [63]:
# Functions to convert strings into base forms split in a list
# Used for calculating EM and F1 scores
spacy_nlp = spacy.load("en_core_web_sm")
def convert2base(text: str) -> str:
    return " ".join([t.lemma_ for t in spacy_nlp(text)])

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def replace_underscore_with_space(text):
        return text.replace("_", " ")

    def replace_hyphen_with_space(text):
        return text.replace("-", " ")

    def lower(text):
        return text.lower()

    res = []
    if "_" in s or "-" in s:
        new_ans1 = white_space_fix(
            remove_articles(
                remove_punc(
                    replace_hyphen_with_space(
                        replace_underscore_with_space(lower(s))
                    )
                )
            )
        )
        res.append(new_ans1)

    new_answer = white_space_fix(remove_articles(remove_punc((lower(s)))))
    res.append(new_answer)
    new_answer = convert2base(new_answer)
    if new_answer not in res:
        res.append(new_answer)
    return res

In [64]:
def f1_scores_per_type(pred_dct, num):
    ''' Function to generate the F1 scores and EM scores per query type'''
    f1_scores = []
    em_scores = []
    for key, value in pred_dct.items():
        predictions_1_truth = value[0]['y']
        predictions_1_0 = value[0][f'y_pred_{num}']
        for references, predictions in zip(predictions_1_truth, predictions_1_0):
            refs = normalize_answer(references)
            preds = normalize_answer(predictions)
            f1_sub_scores = []
            em_sub_scores = []
            print(preds, refs)
            for pred in preds:
                for ref in refs:
                    f1_sub_scores.append(bertscore.compute(predictions=[pred],references=[ref],model_type="distilbert-base-uncased")['f1'][0])
                    if pred == ref:
                        em_sub_scores.append(1)
                    else:
                        em_sub_scores.append(0)
            f1_scores.append(max(f1_sub_scores))
            em_scores.append(max(em_sub_scores))
    return f1_scores, em_scores
f1_scores_0, em_scores_0 = f1_scores_per_type(pred_dct_inst, 0)
f1_scores_1, em_scores_1 = f1_scores_per_type(pred_dct_inst, 1)
f1_scores_3, em_scores_3 = f1_scores_per_type(pred_dct_inst, 3)
f1_scores_5, em_scores_5 = f1_scores_per_type(pred_dct_inst, 5)
f1_scores_10, em_scores_10 = f1_scores_per_type(pred_dct_inst, 10)

['length'] ['length']
['wheel base', 'wheelbase'] ['wheel base', 'wheelbase']
['width'] ['width']
['engine type'] ['engine type', 'enginetype']
['price'] ['price']
['aspiration'] ['aspiration']
['curb weight'] ['curb weight', 'curbweight']
['drive wheels', 'drive wheel'] ['drive wheels', 'drivewheels', 'drivewheel']
['fuel system'] ['fuel system', 'fuelsystem']
['engine location'] ['engine location', 'enginelocation']
['body style'] ['body style', 'bodystyle']
['peak rpm'] ['peak rpm', 'peakrpm']
['fuel type'] ['fuel type', 'fueltype']
['height'] ['height']
['compression ratio'] ['compression ratio', 'compressionratio']
['horsepower'] ['horsepower']
['engine size'] ['engine size', 'enginesize']
['stroke'] ['stroke']
['unique options', 'unique option'] ['unique operands', 'uniqueoperands', 'uniqueoperand']
['design complexity'] ['design complexity', 'designcomplexity']
['defects', 'defect'] ['defects', 'defect']
['decision count'] ['decision count', 'decisioncount']
['multiple content c

['education allowance'] ['education allowance', 'educationallowance']
['statutory holidays', 'statutory holiday'] ['statutory holidays', 'statutoryholidays', 'statutoryholiday']
['cost of living adjustment', 'cost of live adjustment'] ['cost of living adjustment', 'costoflivingadjustment']
['pension'] ['pension']
['vacation'] ['vacation']
['wage increase third year'] ['wage increase third year', 'wageincreasethirdyear']
['wage increase first year'] ['wage increase first year', 'wageincreasefirstyear']
['class'] ['class']
['long term disability assistance', 'longterm disability assistance'] ['longterm disability assistance', 'longtermdisabilityassistance']
['bypass'] ['by pass', 'bypass']
['region of origin'] ['regeneration of', 'regenerationof']
['exclusion of number'] ['exclusion of no', 'exclusionofno']
['special forms', 'special form'] ['special forms', 'specialforms', 'specialform']
['desalination of'] ['dislocation of', 'dislocationof']
['classification'] ['class']
['contrapptive 

['middle middle square'] ['middle middle square', 'middlemiddlesquare']
['bottom left square', 'bottom leave square'] ['bottom left square', 'bottomleftsquare']
['top middle square'] ['top middle square', 'topmiddlesquare']
['density'] ['density']
['sugar content'] ['sulphates', 'sulphate']
['alcohol content'] ['alcohol']
['classification'] ['class']
['free sulfur dioxide'] ['free sulfur dioxide', 'freesulfurdioxide']
['citric acid'] ['citric acid', 'citricacid']
['volatility'] ['volatile acidity', 'volatileacidity']
['fixed acidity', 'fix acidity'] ['fixed acidity', 'fixedacidity']
['total sulfur dioxide'] ['total sulfur dioxide', 'totalsulfurdioxide']
['residual sugar'] ['residual sugar', 'residualsugar']
['chlorides', 'chloride'] ['chlorides', 'chloride']
['location'] ['looks', 'look']
['gender'] ['gender']
['urbanrural'] ['urbanrural']
['sports', 'sport'] ['sports', 'sport']
['goal'] ['goals', 'goal']
['money'] ['money']
['grades', 'grade'] ['grades', 'grade']
['grade'] ['grade']
[

['spleen palpable'] ['spleen palpable', 'spleenpalpable']
['hollows ratio', 'hollow ratio'] ['hollows ratio', 'hollowsratio']
['max length rectangle'] ['maxlength rectangularity', 'maxlengthrectangularity']
['class'] ['class']
['max length rectangle'] ['maxlength aspect ratio', 'maxlengthaspectratio']
['radius ratio'] ['radius ratio', 'radiusratio']
['curvy'] ['circularity']
['ellens', 'ellen'] ['elongatedness']
['scaled variance major', 'scale variance major'] ['scaled variance major', 'scaledvariancemajor']
['skewness about major'] ['skewness about major', 'skewnessaboutmajor']
['compactness'] ['compactness']
['scaled radius of gyration', 'scale radius of gyration'] ['scaled radius of gyration', 'scaledradiusofgyration']
['scatter ratio'] ['scatter ratio', 'scatterratio']
['scaled variance minor', 'scale variance minor'] ['scaled variance minor', 'scaledvarianceminor']
['eccentricity'] ['distance circularity', 'distancecircularity']
['skew about minor'] ['skewness about minor', 'skew

['state'] ['state']
['area code'] ['area code', 'areacode']
['total evening minutes', 'total evening minute'] ['total eve minutes', 'totaleveminutes', 'totaleveminute']
['number of customer service calls', 'number of customer service call'] ['number customer service calls', 'numbercustomerservicecalls', 'numbercustomerservicecall']
['phone number'] ['phone number', 'phonenumber']
['total evening charge'] ['total eve charge', 'totalevecharge']
['total night charge'] ['total night charge', 'totalnightcharge']
['class'] ['class']
['account length'] ['account length', 'accountlength']
['contributions to health plan', 'contribution to health plan'] ['contribution to health plan', 'contributiontohealthplan']
['wrong hours', 'wrong hour'] ['working hours', 'workinghours', 'workinghour']
['standby pay'] ['standby pay', 'standbypay']
['shift differential'] ['shift differential', 'shiftdifferential']
['wage increases second year', 'wage increase second year'] ['wage increase second year', 'wagei

['cap surface'] ['cap surface', 'capsurface']
['stalk surface above ring'] ['stalk surface above ring', 'stalksurfaceabovering', 'stalksurfaceabovere']
['spore print color'] ['spore print color', 'sporeprintcolor']
['veil type'] ['veil type', 'veiltype']
['gill color'] ['gill color', 'gillcolor']
['gill size'] ['gill size', 'gillsize']
['population'] ['population']
['habitat'] ['habitat']
['stalk color below ring'] ['stalk color below ring', 'stalkcolorbelowring', 'stalkcolorbelowre']
['veil color'] ['veil color', 'veilcolor']
['ring number'] ['ring number', 'ringnumber']
['gill spacing', 'gill space'] ['gill spacing', 'gillspacing', 'gillspace']
['stalk shape'] ['stalk shape', 'stalkshape']
['stalk root'] ['stalk root', 'stalkroot']
['cap color'] ['cap color', 'capcolor']
['ring type'] ['ring type', 'ringtype']
['class'] ['class']
['top right square'] ['top right square', 'toprightsquare']
['class'] ['class']
['middle left square', 'middle leave square'] ['middle left square', 'middle

['proline'] ['proline']
['row green mean'] ['rawgreen mean', 'rawgreenmean']
['row blue mean'] ['rawblue mean', 'rawbluemean']
['short line density 2', 'short line density2'] ['short line density 2', 'shortlinedensity2']
['strain mean'] ['saturation mean', 'saturationmean']
['hue mean'] ['hue mean', 'huemean']
['heading mean', 'head mean'] ['hedge mean', 'hedgemean']
['region pixel count'] ['region pixel count', 'regionpixelcount']
['short line density 5', 'short line density5'] ['short line density 5', 'shortlinedensity5']
['intensity mean'] ['intensity mean', 'intensitymean']
['value mean'] ['value mean', 'valuemean']
['region center color'] ['region centroid col', 'regioncentroidcol']
['class'] ['class']
['region centered row', 'region center row'] ['region centroid row', 'regioncentroidrow']
['class'] ['class']
['steroids', 'steroid'] ['steroid']
['biopsy'] ['bilirubin']
['vaccines', 'vaccine'] ['varices', 'varix']
['fatigue'] ['fatigue']
['ascites', 'ascite'] ['ascites', 'ascite']

['attribute 5', 'attribute5'] ['attribute 5', 'attribute5']
['attribute 4', 'attribute4'] ['attribute 4', 'attribute4']
['attribute 8', 'attribute8'] ['attribute 8', 'attribute8']
['class'] ['class']
['attribute 9', 'attribute9'] ['attribute 9', 'attribute9']
['bid close'] ['bid close', 'bidclose']
['class'] ['class']
['ask volume'] ['ask volume', 'askvolume']
['bid open'] ['bid open', 'bidopen']
['bid volume'] ['bid volume', 'bidvolume']
['ask high'] ['ask high', 'askhigh']
['bid high'] ['bid high', 'bidhigh']
['ask open'] ['ask open', 'askopen']
['ask close'] ['ask close', 'askclose']
['title vechicles', 'title vechicle'] ['total eve calls', 'totalevecalls', 'totalevecall']
['total day minutes', 'total day minute'] ['total day minutes', 'totaldayminutes', 'totaldayminute']
['total night minutes', 'total night minute'] ['total night minutes', 'totalnightminutes', 'totalnightminute']
['total night calls', 'total night call'] ['total night calls', 'totalnightcalls', 'totalnightcall']
['

In [66]:
len(f1_scores_0), len(f1_scores_1), len(f1_scores_3), len(f1_scores_5), len(f1_scores_10), len(em_scores_0), len(em_scores_1), len(em_scores_3), len(em_scores_5), len(em_scores_10)

(234, 234, 234, 234, 234, 234, 234, 234, 234, 234)

In [67]:
# calculate the avg f1 and em scores
avg_f1_0 = mean(f1_scores_0)
avg_f1_1 = mean(f1_scores_1)
avg_f1_3 = mean(f1_scores_3)
avg_f1_5 = mean(f1_scores_5)
avg_f1_10 = mean(f1_scores_10)
avg_em_0 = mean(em_scores_0)
avg_em_1 = mean(em_scores_1)
avg_em_3 = mean(em_scores_3)
avg_em_5 = mean(em_scores_5)
avg_em_10 = mean(em_scores_10)
avg_f1 = [avg_f1_0, avg_f1_1, avg_f1_3, avg_f1_5, avg_f1_10]
avg_em = [avg_em_0, avg_em_1, avg_em_3, avg_em_5, avg_em_10]
avg_f1, avg_em

([0.945578867298925,
  0.9560033205227975,
  0.9658105131397899,
  0.9644685535349398,
  0.9626528776099539],
 [0.717948717948718,
  0.7649572649572649,
  0.7991452991452992,
  0.7991452991452992,
  0.7991452991452992])

In [68]:
avg_token

[89, 146.15, 223.35, 299.45, 490]

## Normalized F1 scores for experiment 1

In [69]:
for f1, token in zip(avg_f1, avg_token):
    if token == 89:
        print(f1 / 89.65) # In experiment 2, we slightly changed the query format and then the token length for the base line was 89.65 instead of 89, thats why we changed it here
    else:
        print(f1 / token)

0.01054744971889487
0.0065412474890372725
0.0043242019840599505
0.003220799978410218
0.001964597709408069


## Normalized EM scores for experiment 1

In [70]:
for f1, token in zip(avg_em, avg_token):
    if token == 89:
        print(f1 / 89.65)
    else:
        print(f1 / token)

0.008008351566633775
0.005234055866967259
0.003577995518895452
0.002668710299366503
0.0016309087737659167


# Experiment 2: title, description

In [75]:
list_dids = [9,1059,40708,24,36,55,50,40691,1100,185,171,40693,41760,40701,54,4,10,23,13,187]

In [14]:
# This code generates all predictions for experiment 2 for all query types
num = 1
pred_dct_title_desc = dict() #comment out if loaded part of preds in from pkl file
token_length_base = []
token_length_title = []
token_length_desc = []
token_length_title_desc = []

for did in list_dids:
    dataset = openml.datasets.get_dataset(did)
    title = dataset.name
    description = dataset.description
    df, _, _, _ = dataset.get_data(dataset_format="dataframe")
    # Find the non-cryptic columns and convert them to cryptic columns (so we have X and y)
    y = detect_non_cryptic(df)
    X = generate_cryptic(y)

    # Give the cryptic names as input to GPT combined with an n number of instances and ask
    # it to generate non-cryptic names
    gpt_predictions_baseline, t_base = generate_non_cryptic(df, y, X, 0)
    gpt_predictions_title, t_title = generate_non_cryptic(df, y, X, 0, title=title)
    gpt_predictions_desc, t_desc = generate_non_cryptic(df, y, X, 0, description=description)
    gpt_predictions_title_desc, t_title_desc = generate_non_cryptic(df, y, X, 0, title, description)

    if (len(y) != len(gpt_predictions_baseline)) or (len(y) != len(gpt_predictions_title)) or (len(y) != len(gpt_predictions_desc)) or (len(y) != len(gpt_predictions_title_desc)):
        print(f"Something went wrong with the predictions in dataset number {num}")
        print("The number of predictions do not align with the number of ground truth values")
        print(f"Length y: {len(y)}, y_pred_baseline: {len(gpt_predictions_baseline)}, y_pred_title: {len(gpt_predictions_title)}, y_pred_desc: {len(gpt_predictions_desc)}, y_pred_title_desc: {len(gpt_predictions_title_desc)}")
        print("We will not add this dataset to the predictions")
        continue
    else:
        # Store the predictions in a dataframe
        df_predictions = pd.DataFrame()
        df_predictions['y'] = y
        df_predictions['X'] = X
        df_predictions['y_pred_baseline'] = gpt_predictions_baseline
        df_predictions[f'y_pred_title'] = gpt_predictions_title
        df_predictions[f'y_pred_desc'] = gpt_predictions_desc
        df_predictions[f'y_pred_title_desc'] = gpt_predictions_title_desc

        # To compute the bertscore F1 in the end, we will divide by the average token length for every query
        token_length_base.append(t_base)
        token_length_title.append(t_title)
        token_length_desc.append(t_desc)
        token_length_title_desc.append(t_title_desc)

        # Store the predictions with the token lengths in a dictionary, which we can use to compute the scores
        token_lengths = {"token_length" : [t_base, t_title, t_desc, t_title_desc]}
        pred_dct_title_desc[f"pred_df_{num}"] = [df_predictions, token_lengths]
    num += 1
pred_dct_title_desc


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: engine-Lctn | strk | engi-Type | body-styl | drve-whls | peak-Rpm | aspn | engn-Size | wdth | compr-rat | fuel-Type | whl-Base | lnth | hrspwr | heig | pric | fuel-Syst | curb-Weight stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: autos 


        ,
the abbreviated column names: engine-Lctn | strk | engi-Type | body-styl | drve-whls | peak-Rpm | aspn | engn-Size | wdth | compr-rat | fuel-Type | whl-Base | lnth | hrspwr | heig | pric | fuel-Syst | curb-Weight stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**: Jeffrey C. Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)   
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Automobile


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: dcsn_cnt | callpairs | unqeoper | brnchCnt | dfcs | dcsnDnsy | ttloper | mlpl_cndtn_cnt | condition_cont | dsgn_cmlt | frmlPmts | ttalOprr | unqe_oprtrs | dsgndensity stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: ar1 


        ,
the abbreviated column names: dcsn_cnt | callpairs | unqeoper | brnchCnt | dfcs | dcsnDnsy | ttloper | mlpl_cndtn_cnt | condition_cont | dsgn_cmlt | frmlPmts | ttalOprr | unqe_oprtrs | dsgndensity stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
This is a PROMISE Softwa


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: thrdSrgr | tumr | queryhypot | prgnnt | TSHmeas | gtr | lith | hpptry | rfrl_sour | clas stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: allrep 


        ,
the abbreviated column names: thrdSrgr | tumr | queryhypot | prgnnt | TSHmeas | gtr | lith | hpptry | rfrl_sour | clas stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: allrep-pmlb 

        ,
the abbreviated column names: thrdSrgr | tumr | queryhypot | prgnnt | TSHmeas | gtr | lith | hpptry | rfrl_sour | clas stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: allrep 
with description: allrep-pmlb 

 


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: cap-shpe | stlk-Srfc-Blw-Ring | gill-clr | spor-Prnt-Color | cap-clr | ring-numb | stal-root | ring-Type | clas | stlk-shpe | stlk-Colr-Abv-Ring | ppltn | gill-Size | hbtt | veil-Type | stlk-srfc-above-ring | gill-Spac | gill-Atta | stlk-color-belo-ring | veil-Colo | cap-Surf stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: mushroom 


        ,
the abbreviated column names: cap-shpe | stlk-Srfc-Blw-Ring | gill-clr | spor-Prnt-Color | cap-clr | ring-numb | stal-root | ring-Type | clas | stlk-shpe | stlk-Colr-Abv-Ring | ppltn | gill-Size | hbtt | veil-Type | stlk-srfc-above-ring | gill-Spac | gill-Atta | stlk-color-belo-ring | veil-Colo | cap-Surf stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Prod


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: region-cntr-col | intnsty-mean | hue-Mean | srtn-mean | shor-line-dnty-5 | hdge-mean | shrt-Line-Dnst-2 | rgn-cnrd-row | clss | val-Mean | rwbl-mean | rwgr-Mean | rgn-pxl-cont stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: segment 


        ,
the abbreviated column names: region-cntr-col | intnsty-mean | hue-Mean | srtn-mean | shor-line-dnty-5 | hdge-mean | shrt-Line-Dnst-2 | rgn-cnrd-row | clss | val-Mean | rwbl-mean | rwgr-Mean | rgn-pxl-cont stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**: University of Massachusetts Vision Group, Carla Brodley  

**Source**: [UCI](http://archive.ics.uci.edu/ml/datasets/image+segmentation) - 19


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Clss | HIST | ASCS | AIRA | SPER | BLUB | LIVERFIRM | MAIS | VARI | ABIN | SLEN_PLPL | FIGE | ARXI | SOID stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: hepatitis 


        ,
the abbreviated column names: Clss | HIST | ASCS | AIRA | SPER | BLUB | LIVERFIRM | MAIS | VARI | ABIN | SLEN_PLPL | FIGE | ARXI | SOID stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**:   
**Source**: Unknown -   
**Please cite**:   

1. Title: Hepatitis Domain
 
 2. Sources:
      (a) unknown
      (b) Donor: G.Gong  (Carnegie-Mellon University) via 
                    Bojan Cestnik
                    Jozef Stefan Institute
                    Jamova 39
   


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Clss | botm-Rght-Squa | middle-left-sqar | mddl-middle-sqr | mdle-Rght-Squr | top-Middle-Squa | botm-left-sqr | top-Left-Sqre | top-rght-sqre | bttm-midd-sqr stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: tic-tac-toe 


        ,
the abbreviated column names: Clss | botm-Rght-Squa | middle-left-sqar | mddl-middle-sqr | mdle-Rght-Squr | top-Middle-Squa | botm-left-sqr | top-Left-Sqre | top-rght-sqre | bttm-midd-sqr stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**: David W. Aha    

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Tic-Tac-Toe+Endgame) - 1991   

**Please cite**: [UCI](http://archive.ics.uci.edu/ml/citation_po


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: slphts | free_slfr_dxde | chlrds | resisgr | dnsty | citricAcid | alchl | vola_adty | clss | totaSlfrDxd | fixedAcdty stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: wine-quality-red 


        ,
the abbreviated column names: slphts | free_slfr_dxde | chlrds | resisgr | dnsty | citricAcid | alchl | vola_adty | clss | totaSlfrDxd | fixedAcdty stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: wine-quality-red-pmlb 

        ,
the abbreviated column names: slphts | free_slfr_dxde | chlrds | resisgr | dnsty | citricAcid | alchl | vola_adty | clss | totaSlfrDxd | fixedAcdty stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand fo


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Lks | Grde | Urbn/Rral | Grds | Gndr | Gals | Spts | Schl | Mony stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: PopularKids 


        ,
the abbreviated column names: Lks | Grde | Urbn/Rral | Grds | Gndr | Gals | Spts | Schl | Mony stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

Datasets of Data And Story Library, project illustrating use of basic statistic methods, converted to arff format by Hakan Kjellerstrand.
Source: TunedIT: http://tunedit.org/repo/DASL

DASL file http://lib.stat.cmu.edu/DASL/Datafiles/PopularKids.html

Students' Gals
,

What Makes Kids Popular

Re


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Posi | Stri | Wlks | Gmsplyd | HallOfFame | Doub | AtBats | Nmbr_ssns | Trls | Batting_avrg stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: baseball 


        ,
the abbreviated column names: Posi | Stri | Wlks | Gmsplyd | HallOfFame | Doub | AtBats | Nmbr_ssns | Trls | Batting_avrg stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: Database of baseball players and play statistics, including 'Gmsplyd', 'AtBats', 'Runs', 'Hits', 'Doub', 'Trls', 'Home_runs', 'RBIs', 'Wlks', 'Stri', 'Batting_avrg', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave' 

Notes:  
* Quotes, Single-Quotes and Backslashes were removed, Blanks replaced with Underscores
* Play


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: media | plr | bone-mrrw | brn | lvr | abdo | peri | clas | hstlgc-type stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: primary-tumor 


        ,
the abbreviated column names: media | plr | bone-mrrw | brn | lvr | abdo | peri | clas | hstlgc-type stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**:   
**Source**: Unknown -   
**Please cite**:   

Citation Request:
    This primary tumor domain was obtained from the University Medical Centre,
    Institute of Oncology, Ljubljana, Yugoslavia.  Thanks go to M. Zwitter and 
    M. Soklic for providing the data.  Please include this citation if you plan
    to use this database.
 
 1. Title:

Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: primary-tumor 
with description: **Author**:   
**Source**: Unknown -   
**Please cite**:   

Citation Request:
    This primary tumor domain was obtained from the University Medical Centre,
    Institute of Oncology, Ljubljana, Yugoslavia.  Thanks go to M. Zwitter and 
    M. Soklic for providing the data.  Please include this citation if you plan
    to use this database.
 
 1. Title: Primary Tumor Domain
 
 2. Sources:
      (a) Source:
      (b) Donors: Igor Kononenko, 
                  University E.Kardelj
                  Faculty for electrical engineering
                  Trzaska 25
                  61000 Ljubljana (tel.: (38)(+61) 265-161
 
                  Bojan Cestnik
                  Jozef Stefan Institute
                  Jamova 39
                  61000 Ljubljana
                  Yugoslavia (tel.: (38)(+61) 214-399 ext.287) 
      (


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Arbt1 | Attr2 | Atrt_7 | Attr_9 | Attrbt8 | Attr3 | Attrbt5 | clas | Attrbt4 | Attrbt_6 stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: xd6 


        ,
the abbreviated column names: Arbt1 | Attr2 | Atrt_7 | Attr_9 | Attrbt8 | Attr3 | Attrbt5 | clas | Attrbt4 | Attrbt_6 stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**: Unknown  

**Source**: [PMLB](https://github.com/EpistasisLab/penn-ml-benchmarks/tree/master/datasets/clasification) - Supposedly originates from UCI, but can't find it there anymore.  

**Please cite:**  



**XD6 Dataset**

Dataset used by Buntine and Niblett (1992). Composed of 10 features, one of which is irrelevan


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Clas | AskOpen | BidOpen | BidHigh | AskVlme | BidClos | BidVlm | Ask_Cls | AskHigh stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: FOREX_eurhuf-day-High 


        ,
the abbreviated column names: Clas | AskOpen | BidOpen | BidHigh | AskVlme | BidClos | BidVlm | Ask_Cls | AskHigh stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Source**: Dukascopy Historical Data Feed https://www.dukascopy.com/swiss/english/marketwatch/historical/
**Edited by**: Fabian Schut
 
# Data Description
This is the historical price data of the FOREX EUR/HUF from Dukascopy.
One instance (row) is one candlestick of one day.
The whole dataset has the data range from 1-1


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: totleveclls | ttldaycall | ttlDayChrg | areaCode | totaNighCals | clss | accolngth | stt | ttl_nght_mnts | phonNmbr | internPlan | totl_day_mnts | ttlEveMnts | nmbr_customer_srvc_call | voicmailplan | ttalNighChrg | ttlevechrg stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: churn 


        ,
the abbreviated column names: totleveclls | ttldaycall | ttlDayChrg | areaCode | totaNighCals | clss | accolngth | stt | ttl_nght_mnts | phonNmbr | internPlan | totl_day_mnts | ttlEveMnts | nmbr_customer_srvc_call | voicmailplan | ttalNighChrg | ttlevechrg stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**: Unknown  

**Source**: [PMLB](https://gi


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: MAX.LENG_RECTANGULARITY | HOWSRATI | Clas | SAERRATIO | SKEW_ABOUT_MAJOR | SCALVARIMAJOR | SCLDRADIOFGYRATION | MAX.LENGASPECTRATIO | ELONGA | SKEWNESSABOUTMINOR | SLEDVARIANCEMINO | RADIRATO | DISTANCE_CIRCU | CPCTE | CIRCU stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: vehicle 


        ,
the abbreviated column names: MAX.LENG_RECTANGULARITY | HOWSRATI | Clas | SAERRATIO | SKEW_ABOUT_MAJOR | SCALVARIMAJOR | SCLDRADIOFGYRATION | MAX.LENGASPECTRATIO | ELONGA | SKEWNESSABOUTMINOR | SLEDVARIANCEMINO | RADIRATO | DISTANCE_CIRCU | CPCTE | CIRCU stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**: Dr. Pete Mowforth and Dr. Barry Shepherd  

Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: vehicle 
with description: **Author**: Dr. Pete Mowforth and Dr. Barry Shepherd  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Statlog+(Vehicle+Silhouettes))

**Please cite**: Siebert,JP. Turing Institute Research Memorandum TIRM-87-018 "Vehicle Recognition Using Rule Based Methods" (March 1987)  



 NAME

         vehicle silhouettes

 

 PURPOSE

         to classify a given silhouette as one of four types of vehicle,

         using  a set of features extracted from the silhouette. The

         vehicle may be viewed from one of many different angles.  

 

 PROBLEM TYPE

         classification

         

 SOURCE

         Drs.Pete Mowforth and Barry Shepherd

         Turing Institute

         George House

         36 North Hanover St.

         Glasgow

         G1 2AD

 

 CONTACT

         Alistair Sutherland

         Statistics


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: shif-differential | stttry-Holi | vctn | drtn | brvmnt-assstnc | stan-pay | clss | contribution-to-dntl-plan | pnsn | cntrbtn-to-hlth-plan | long-Dsblty-Astn | cost-Of-Livi-Adju | wkng-hrs | wage-incs-thrd-year | ectn-allwnc | wage-Icrs-Frst-Year | wage-Incrs-Seco-Year stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: labor 


        ,
the abbreviated column names: shif-differential | stttry-Holi | vctn | drtn | brvmnt-assstnc | stan-pay | clss | contribution-to-dntl-plan | pnsn | cntrbtn-to-hlth-plan | long-Dsblty-Astn | cost-Of-Livi-Adju | wkng-hrs | wage-incs-thrd-year | ectn-allwnc | wage-Icrs-Frst-Year | wage-Incrs-Seco-Year stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.



Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: excl_of_no | spclfrms | dslctnof | rgnrtnof | bypass | clas stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: lymph 


        ,
the abbreviated column names: excl_of_no | spclfrms | dslctnof | rgnrtnof | bypass | clas stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**:   
**Source**: Unknown -   
**Please cite**:   

Citation Request:
    This lymphography domain was obtained from the University Medical Centre,
    Institute of Oncology, Ljubljana, Yugoslavia.  Thanks go to M. Zwitter and 
    M. Soklic for providing the data.  Please include this citation if you plan
    to use this database.
 
 1. Title: Lymphography Domain
 
 2. Sour


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: WifsEducation | NumbOfChlrEverBorn | Contramthdused | Sndr-Of-LivingIndx | WfsReli | Hsbdoccptn | Hsbndsedtn | Media_expr | Wfsage stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: cmc 


        ,
the abbreviated column names: WifsEducation | NumbOfChlrEverBorn | Contramthdused | Sndr-Of-LivingIndx | WfsReli | Hsbdoccptn | Hsbndsedtn | Media_expr | Wfsage stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**: [Tjen-Sien Lim](limt@stat.wisc.edu) 

**Source**: [As obtained from UCI](https://archive.ics.uci.edu/ml/datasets/Contraceptive+Method+Choice)

**Please cite**: [UCI citation](https://archive.ics.uci.edu/ml/citation_policy.html)



1. 


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Clss | node-Caps | tmor-Size | brst | meno | brst-Quad stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: breast-cancer 


        ,
the abbreviated column names: Clss | node-Caps | tmor-Size | brst | meno | brst-Quad stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**:   
**Source**: Unknown -   
**Please cite**:   

Citation Request:
    This brst cancer domain was obtained from the University Medical Centre,
    Institute of Oncology, Ljubljana, Yugoslavia.  Thanks go to M. Zwitter and 
    M. Soklic for providing the data.  Please include this citation if you plan
    to use this database.
 
 1. Title: Breast cancer data (Michalski has


Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.



Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset,
the abbreviated column names: Mgnsm | TtlPhnls | Clr_inst | Alchl | Prln | clas stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset
with title: wine 


        ,
the abbreviated column names: Mgnsm | TtlPhnls | Clr_inst | Alchl | Prln | clas stand for
Abbreviated column names from a dataset: c_name | pCd | dt stand for Customer Name | Product Code | Date.
From a dataset

with description: **Author**:   
**Source**: Unknown -   
**Please cite**:   

1. Title of Database: Wine recognition data
 	Updated Sept 21, 1998 by C.Blake : Added attribute information
 
 2. Sources:
    (a) Forina, M. et al, PARVUS - An Extendible Package for Data
        Exploration, Classification and Correlation. Institute of Pharmaceutical
        and Food Analysis and Technologies, Via Brigata Salerno, 
        1

{'pred_df_1': [                    y            X    y_pred_baseline       y_pred_title  \
  0     engine-location  engine-Lctn    Engine Location    Engine Location   
  1              stroke         strk             Stroke             Stroke   
  2         engine-type    engi-Type        Engine Type        Engine Type   
  3          body-style    body-styl         Body Style         Body Style   
  4        drive-wheels    drve-whls       Drive Wheels       Drive Wheels   
  5            peak-rpm     peak-Rpm           Peak RPM           Peak RPM   
  6          aspiration         aspn         Aspiration         Aspiration   
  7         engine-size    engn-Size        Engine Size        Engine Size   
  8               width         wdth              Width              Width   
  9   compression-ratio    compr-rat  Compression Ratio  Compression Ratio   
  10          fuel-type    fuel-Type          Fuel Type          Fuel Type   
  11         wheel-base     whl-Base         Wheel 

In [16]:
# Save the preditions to a pickle file
with open('data/complete_preds_experiment_2.pkl', 'wb') as handle:
    pickle.dump(pred_dct_title_desc, handle)

In [84]:
# Load the predictions in
with open('data/complete_preds_experiment_2.pkl', 'rb') as handle:
    pred_dct_title_desc = pickle.load(handle)
pred_dct_title_desc

{'pred_df_1': [                    y            X    y_pred_baseline       y_pred_title  \
  0     engine-location  engine-Lctn    Engine Location    Engine Location   
  1              stroke         strk             Stroke             Stroke   
  2         engine-type    engi-Type        Engine Type        Engine Type   
  3          body-style    body-styl         Body Style         Body Style   
  4        drive-wheels    drve-whls       Drive Wheels       Drive Wheels   
  5            peak-rpm     peak-Rpm           Peak RPM           Peak RPM   
  6          aspiration         aspn         Aspiration         Aspiration   
  7         engine-size    engn-Size        Engine Size        Engine Size   
  8               width         wdth              Width              Width   
  9   compression-ratio    compr-rat  Compression Ratio  Compression Ratio   
  10          fuel-type    fuel-Type          Fuel Type          Fuel Type   
  11         wheel-base     whl-Base         Wheel 

In [78]:
def f1_scores_per_type(pred_dct, typ):
    ''' Function to generate f1 and em scores per query type'''
    f1_scores = []
    em_scores = []
    for key, value in pred_dct.items():
        predictions_1_truth = value[0]['y']
        predictions_1_0 = value[0][f'y_pred_{typ}']
        for references, predictions in zip(predictions_1_truth, predictions_1_0):
            refs = normalize_answer(references)
            preds = normalize_answer(predictions)
            f1_sub_scores = []
            em_sub_scores = []
            print(preds, refs)
            for pred in preds:
                for ref in refs:
                    f1_sub_scores.append(bertscore.compute(predictions=[pred],references=[ref],model_type="distilbert-base-uncased")['f1'][0])
                    if pred == ref:
                        em_sub_scores.append(1)
                    else:
                        em_sub_scores.append(0)
            f1_scores.append(max(f1_sub_scores))
            em_scores.append(max(em_sub_scores))
    return f1_scores, em_scores
f1_scores_baseline, em_scores_bs = f1_scores_per_type(pred_dct_title_desc, 'baseline')
f1_scores_title, em_scores_tit = f1_scores_per_type(pred_dct_title_desc, 'title')
f1_scores_desc, em_scores_desc = f1_scores_per_type(pred_dct_title_desc, 'desc')
f1_scores_title_desc, em_scores_tit_desc = f1_scores_per_type(pred_dct_title_desc, 'title_desc')

['engine location'] ['engine location', 'enginelocation']
['stroke'] ['stroke']
['engine type'] ['engine type', 'enginetype']
['body style'] ['body style', 'bodystyle']
['drive wheels', 'drive wheel'] ['drive wheels', 'drivewheels', 'drivewheel']
['peak rpm'] ['peak rpm', 'peakrpm']
['aspiration'] ['aspiration']
['engine size'] ['engine size', 'enginesize']
['width'] ['width']
['compression ratio'] ['compression ratio', 'compressionratio']
['fuel type'] ['fuel type', 'fueltype']
['wheel base'] ['wheel base', 'wheelbase']
['length'] ['length']
['horsepower'] ['horsepower']
['height'] ['height']
['price'] ['price']
['fuel system'] ['fuel system', 'fuelsystem']
['curb weight'] ['curb weight', 'curbweight']
['decision count'] ['decision count', 'decisioncount']
['call pairs', 'call pair'] ['call pairs', 'callpairs', 'callpair']
['unique operators', 'unique operator'] ['unique operands', 'uniqueoperands', 'uniqueoperand']
['branch count'] ['branch count', 'branchcount']
['decision factors',

['phone number'] ['phone number', 'phonenumber']
['international plan'] ['international plan', 'internationalplan']
['total day minutes', 'total day minute'] ['total day minutes', 'totaldayminutes', 'totaldayminute']
['total evening minutes', 'total evening minute'] ['total eve minutes', 'totaleveminutes', 'totaleveminute']
['number of customer service calls', 'number of customer service call'] ['number customer service calls', 'numbercustomerservicecalls', 'numbercustomerservicecall']
['voicemail plan'] ['voice mail plan', 'voicemailplan']
['total night charges', 'total night charge'] ['total night charge', 'totalnightcharge']
['total evening charges', 'total evening charge'] ['total eve charge', 'totalevecharge']
['maximum length rectangularity'] ['maxlength rectangularity', 'maxlengthrectangularity']
['hows rati', 'how s rati'] ['hollows ratio', 'hollowsratio']
['class'] ['class']
['saer ratio'] ['scatter ratio', 'scatterratio']
['skew about major'] ['skewness about major', 'skewnes

['value mean'] ['value mean', 'valuemean']
['row blur mean'] ['rawblue mean', 'rawbluemean']
['row green mean'] ['rawgreen mean', 'rawgreenmean']
['region pixel count'] ['region pixel count', 'regionpixelcount']
['classification'] ['class']
['histology'] ['histology']
['ascites', 'ascite'] ['ascites', 'ascite']
['alk phosphate'] ['antivirals', 'antiviral']
['spermidine'] ['spiders', 'spider']
['bilirubin'] ['bilirubin']
['liver firm'] ['liver firm', 'liverfirm']
['malaise'] ['malaise']
['varices', 'varix'] ['varices', 'varix']
['albumin'] ['albumin']
['spleen palpable'] ['spleen palpable', 'spleenpalpable']
['figer'] ['fatigue']
['ascites', 'ascite'] ['anorexia']
['spider naevi'] ['steroid']
['class'] ['class']
['bottom right square'] ['bottom right square', 'bottomrightsquare']
['middle left square', 'middle leave square'] ['middle left square', 'middleleftsquare']
['middle middle square'] ['middle middle square', 'middlemiddlesquare']
['middle right square'] ['middle right square', '

['husbands occupation', 'husband occupation'] ['husbands occupation', 'husbandsoccupation']
['husbands education', 'husband education'] ['husbands education', 'husbandseducation']
['media exposure', 'medium exposure'] ['media exposure', 'mediaexposure']
['wifes age', 'wife age'] ['wifes age', 'wifesage']
['class'] ['class']
['node capsules', 'node capsule'] ['node caps', 'nodecaps', 'nodecap']
['tumor size'] ['tumor size', 'tumorsize']
['breast'] ['breast']
['menopause'] ['menopause']
['breast quadrant'] ['breast quad', 'breastquad']
['magnesium'] ['magnesium']
['total phenols', 'total phenol'] ['total phenols', 'totalphenols', 'totalphenol']
['color intensity'] ['color intensity', 'colorintensity']
['alcohol'] ['alcohol']
['proline'] ['proline']
['class'] ['class']
['engine location'] ['engine location', 'enginelocation']
['stroke'] ['stroke']
['engine type'] ['engine type', 'enginetype']
['body style'] ['body style', 'bodystyle']
['drive wheels', 'drive wheel'] ['drive wheels', 'driv

['bid volume'] ['bid volume', 'bidvolume']
['ask closing price'] ['ask close', 'askclose']
['ask highest price', 'ask high price'] ['ask high', 'askhigh']
['total number of level calls', 'total number of level call'] ['total eve calls', 'totalevecalls', 'totalevecall']
['total day calls', 'total day call'] ['total day calls', 'totaldaycalls', 'totaldaycall']
['total day charges', 'total day charge'] ['total day charge', 'totaldaycharge']
['area code'] ['area code', 'areacode']
['total night calls', 'total night call'] ['total night calls', 'totalnightcalls', 'totalnightcall']
['class'] ['class']
['account length'] ['account length', 'accountlength']
['state'] ['state']
['total night minutes', 'total night minute'] ['total night minutes', 'totalnightminutes', 'totalnightminute']
['phone number'] ['phone number', 'phonenumber']
['international plan'] ['international plan', 'internationalplan']
['total day minutes', 'total day minute'] ['total day minutes', 'totaldayminutes', 'totaldaymin

['gill attachment'] ['gill attachment', 'gillattachment']
['stalk color below ring'] ['stalk color below ring', 'stalkcolorbelowring', 'stalkcolorbelowre']
['veil color'] ['veil color', 'veilcolor']
['cap surface'] ['cap surface', 'capsurface']
['region center column'] ['region centroid col', 'regioncentroidcol']
['intensity mean'] ['intensity mean', 'intensitymean']
['hue mean'] ['hue mean', 'huemean']
['saturation mean'] ['saturation mean', 'saturationmean']
['short line density 5'] ['short line density 5', 'shortlinedensity5']
['hedge mean'] ['hedge mean', 'hedgemean']
['short line density 2'] ['short line density 2', 'shortlinedensity2']
['region center row'] ['region centroid row', 'regioncentroidrow']
['class'] ['class']
['value mean'] ['value mean', 'valuemean']
['raw blue mean'] ['rawblue mean', 'rawbluemean']
['raw green mean'] ['rawgreen mean', 'rawgreenmean']
['region pixel count'] ['region pixel count', 'regionpixelcount']
['class'] ['class']
['history'] ['histology']
['asc

['exclusion of no'] ['exclusion of no', 'exclusionofno']
['special forms', 'special form'] ['special forms', 'specialforms', 'specialform']
['dislocation of'] ['dislocation of', 'dislocationof']
['regeneration of'] ['regeneration of', 'regenerationof']
['bypass'] ['by pass', 'bypass']
['class'] ['class']
['wifes education', 'wife education'] ['wifes education', 'wifeseducation']
['number of children ever born', 'number of child ever bear'] ['number of children ever born', 'numberofchildreneverborn']
['contraceptive method used', 'contraceptive method use'] ['contraceptive method used', 'contraceptivemethodused', 'contraceptivemethoduse']
['standard of living index', 'standardofliving index', 'standardoflive index'] ['standard of living index', 'standardoflivingindex']
['wifes religion', 'wife religion'] ['wifes religion', 'wifesreligion']
['husbands occupation', 'husband occupation'] ['husbands occupation', 'husbandsoccupation']
['husbands education', 'husband education'] ['husbands ed

In [80]:
# average f1 scores
average_baseline = mean(f1_scores_baseline)
average_title = mean(f1_scores_title)
average_desc = mean(f1_scores_desc)
average_title_desc = mean(f1_scores_title_desc)
f1_scores = [average_baseline, average_title, average_desc, average_title_desc]
f1_scores

[0.9478237529595693, 0.962423782572787, 0.9572087732645181, 0.9645706275080004]

In [81]:
# average token lengths
token_len_bs = []
token_len_tit = []
token_len_desc = []
token_len_tit_desc = []

for df,tokens in pred_dct_title_desc.values():
    token_len_bs.append(tokens['token_length'][0])
    token_len_tit.append(tokens['token_length'][1])
    token_len_desc.append(tokens['token_length'][2])
    token_len_tit_desc.append(tokens['token_length'][3])
avg_tokens_bs = mean(token_len_bs)
avg_tokens_tit = mean(token_len_tit)
avg_tokens_desc = mean(token_len_desc)
avg_tokens_tit_desc = mean(token_len_tit_desc)
avg_tokens = [avg_tokens_bs, avg_tokens_tit, avg_tokens_desc, avg_tokens_tit_desc]
avg_tokens

[89.65, 97.8, 878.1, 884.25]

In [83]:
# average em scores
em_scores = [mean(em_scores_bs), mean(em_scores_tit), mean(em_scores_desc), mean(em_scores_tit_desc)]
em_scores

[0.7393162393162394, 0.7692307692307693, 0.7777777777777778, 0.811965811965812]

## Normalized F1 scores for experiment 2

In [67]:
for f1, token in zip(f1_scores, avg_tokens):
    print(f1 / token)

0.010572490272834013
0.009840733973136883
0.0010900908475851476
0.0010908347497970035


## Normalized EM scores for experiment 2

In [68]:
for em, token in zip(em_scores, avg_tokens):
    print(em / token)

0.00824669536325978
0.007865345288658173
0.000885750800339116
0.0009182536748270421
