# Download artifacts/data from wandb for figure generation

## Imports

In [None]:
import json

import pandas as pd
import wandb
from tqdm.notebook import tqdm

## Download

In [None]:
def GetRuns(project, sweepid, endpoints):
    """Collect the runs of a wandb sweep whose config matches ``endpoints``.

    Args:
        project: Project path in the form ``<entity>/<project-name>``.
            Trailing slashes are tolerated.
        sweepid: Id of the sweep whose runs are inspected.
        endpoints: Value that ``run.config['endpoints']`` must equal for a
            run to be kept.

    Returns:
        A ``pd.DataFrame`` with one row per matching run and the columns
        ``summary``, ``config``, ``name``, ``run_ids`` and ``sweepid``.
    """
    api = wandb.Api()

    # Strip trailing slashes so the sweep path never contains an empty
    # segment ("entity/project//sweep") when the caller passes "entity/project/".
    sweep_path = f"{project.rstrip('/')}/{sweepid}"
    runs = api.sweep(sweep_path).runs

    summary_list, config_list, name_list, ids = [], [], [], []
    for run in runs:
        run_config = run.config

        # A run without an 'endpoints' entry cannot match; skip it instead of
        # aborting the whole scan with a KeyError.
        if 'endpoints' not in run_config or run_config['endpoints'] != endpoints:
            continue

        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run_config.items() if not k.startswith('_')})

        # .name is the human-readable name of the run.
        name_list.append(run.name)
        ids.append(run.id)

    return pd.DataFrame({
        "summary": summary_list,
        "config": config_list,
        "name": name_list,
        "run_ids": ids,
        # Scalar: repeated on every row.
        'sweepid': sweepid
        })

In [None]:
def LoadArtifact(wandbrun, runid, entity, project, artifact_name, fname, version, type):
    """Download a logged wandb table artifact and return it as a DataFrame.

    Args:
        wandbrun: Object exposing ``use_artifact(name, type=...)``; the active
            wandb run in this notebook.
        runid: Id of the run that logged the artifact.
        entity: wandb entity that owns the project.
        project: wandb project name.
        artifact_name: Name suffix of the artifact; the full artifact name is
            ``run-<runid>-<artifact_name>``.
        fname: Base name of the ``<fname>.table.json`` file inside the
            downloaded artifact directory.
        version: Artifact version, e.g. ``'v0'``.
        type: Artifact type passed through to ``use_artifact``.

    Returns:
        A ``pd.DataFrame`` built from the table's ``columns`` and ``data``
        entries, without the columns listed in ``unwanted_columns`` below.
    """
    artifact = wandbrun.use_artifact(
        f'{entity}/{project}/run-{runid}-{artifact_name}:{version}', type=type)
    artifact_dir = artifact.download()

    # Use a context manager so the file handle is closed deterministically.
    with open(f'{artifact_dir}/{fname}.table.json', encoding='utf-8') as table_file:
        table = json.load(table_file)

    df = pd.DataFrame(data=table['data'], columns=table['columns'])

    # Columns removed from the table when present; errors='ignore' skips the
    # ones a given table does not contain, without hiding unrelated errors.
    unwanted_columns = [
        'internal_id', 'Conc_sign', 'species_group', 'Pubchem_CID', 'xlogp',
        'mw', 'Canonical_SMILES', 'Lineage', 'OneHotEnc_effect',
        'OneHotEnc_endpoint']

    return df.drop(columns=unwanted_columns, errors='ignore')

In [None]:
def CombineKFoldPredictions(wandbrun, runs_df, project, version, artifact_name, name, entity='ecotoxformer'):
    """Download the prediction table of every run and stack them row-wise.

    Each run's table is tagged with the ``seed`` and ``fold_id`` taken from
    that run's config before the tables are concatenated.

    Args:
        wandbrun: Active wandb run, forwarded to ``LoadArtifact``.
        runs_df: DataFrame with a ``run_ids`` column and a ``config`` column
            whose entries are mappings containing ``seed`` and ``fold_id``.
        project: wandb project name.
        version: Artifact version, e.g. ``'v0'``.
        artifact_name: Name suffix of the artifact logged by each run.
        name: Base name of the downloaded ``<name>.table.json`` file.
        entity: wandb entity that owns the project.

    Returns:
        A single ``pd.DataFrame`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``runs_df`` has no rows.
    """
    if len(runs_df) == 0:
        raise ValueError('runs_df is empty: there are no runs to download predictions for.')

    # Iterate positionally so a non-default index on runs_df cannot break lookups.
    run_rows = zip(runs_df['run_ids'], runs_df['config'])

    frames = []
    for run_id, config in tqdm(run_rows, total=len(runs_df)):
        run_table = LoadArtifact(
            wandbrun=wandbrun,
            runid=run_id,
            entity=entity,
            project=project,
            artifact_name=artifact_name,
            fname=name,
            version=version,
            type='run_table')

        run_table['seed'] = config['seed']
        run_table['fold_id'] = config['fold_id']
        frames.append(run_table)

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(frames, ignore_index=True)

def CombinePredictions(wandbrun, runs_df, project, version, artifact_name, name, entity='ecotoxformer'):
    """Download the prediction table of every run and stack them row-wise.

    Each run's table is tagged with the ``base_model`` and ``loss_fun``
    values from the matching row of ``runs_df`` before concatenation.

    Args:
        wandbrun: Active wandb run, forwarded to ``LoadArtifact``.
        runs_df: DataFrame with the columns ``run_ids``, ``base_model`` and
            ``loss_fun``.
        project: wandb project name.
        version: Artifact version, e.g. ``'v0'``.
        artifact_name: Name suffix of the artifact logged by each run.
        name: Base name of the downloaded ``<name>.table.json`` file.
        entity: wandb entity that owns the project.

    Returns:
        A single ``pd.DataFrame`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``runs_df`` has no rows.
    """
    if len(runs_df) == 0:
        raise ValueError('runs_df is empty: there are no runs to download predictions for.')

    # Iterate positionally so a non-default index on runs_df cannot break lookups.
    run_rows = zip(runs_df['run_ids'], runs_df['base_model'], runs_df['loss_fun'])

    frames = []
    for run_id, base_model, loss_fun in tqdm(run_rows, total=len(runs_df)):
        run_table = LoadArtifact(
            wandbrun=wandbrun,
            runid=run_id,
            entity=entity,
            project=project,
            artifact_name=artifact_name,
            fname=name,
            version=version,
            type='run_table')

        run_table['base_model'] = base_model
        run_table['loss_fun'] = loss_fun
        frames.append(run_table)

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Authenticate this session with Weights & Biases before any API call is made.
wandb.login()

In [None]:
# Start a run in the "artifacts-analysis" project. The returned run object is
# what LoadArtifact calls use_artifact() on to fetch the logged tables.
wandbrun = wandb.init(project="artifacts-analysis", job_type='tmp_run')

In [None]:
# Settings for the sweep whose predictions are downloaded.
# PROJECT_NAME / SWEEP_ID: wandb project and sweep that GetRuns queries.
PROJECT_NAME = 'Judson'
SWEEP_ID = 'jnkzjb20'
# Only runs whose config['endpoints'] equals this list are kept by GetRuns.
ENDPOINT = ['EC50']
# Artifact name suffix; the full artifact name is 'run-<runid>-<ARTIFACT_NAME>'.
ARTIFACT_NAME = 'BestValidationResults'
DOWNLOADED_ARTIFACT_NAME = 'Best Validation Results' # ARTIFACT_NAME split at capital letters; base name of the downloaded '<name>.table.json' file
ARTIFACT_VERSION = 'v0'
FILENAME = '' # Base name (no extension) of the files written by the save cells; must be set before saving

In [None]:
# Fetch the runs of the sweep that were trained on ENDPOINT. The project path
# is '<entity>/<project>' with no trailing slash: GetRuns joins it to the
# sweep id with '/', so a trailing slash would yield 'entity/project//sweep'.
runs_df = GetRuns(project=f"ecotoxformer/{PROJECT_NAME}", sweepid=SWEEP_ID, endpoints=ENDPOINT)

K-Fold CV

In [None]:
# Download the table artifact of every run in runs_df and stack them into one
# frame; each run's rows are tagged with that run's seed and fold_id.
concatenated_results = CombineKFoldPredictions(
    wandbrun=wandbrun, 
    runs_df=runs_df,
    project=PROJECT_NAME, 
    version=ARTIFACT_VERSION, 
    artifact_name=ARTIFACT_NAME,
    name=DOWNLOADED_ARTIFACT_NAME)

  0%|          | 0/4 [00:00<?, ?it/s]

Base model runs

In [None]:
# Runs of the base-model comparison, with the loss function and pretrained
# checkpoint each one used (lists are aligned by position).
runs_df = pd.DataFrame({
    'run_ids': ['56vivdmi', 'dl8u33ui', '78lq56v1', '128aaus7'],
    'loss_fun': ['L1Loss', 'L1Loss', 'MSELoss', 'MSELoss'],
    'base_model': [
        'seyonec/SMILES_tokenized_PubChem_shard00_160k',
        'seyonec/PubChem10M_SMILES_BPE_450k',
        'seyonec/SMILES_tokenized_PubChem_shard00_160k',
        'seyonec/PubChem10M_SMILES_BPE_450k',
    ],
})

# Each run logged one artifact per fold, numbered 1..5
# ('BestValidationResults1' ... 'BestValidationResults5').
fold_frames = []
for i in tqdm(range(5)):
    df = CombinePredictions(wandbrun, runs_df, 'base_model_sweep_RDKit', 'v0', f'BestValidationResults{i+1}', f'Best Validation Results {i+1}')
    df['fold_id'] = i+1
    fold_frames.append(df)

# Concatenate once: avoids concatenating onto an empty DataFrame (deprecated
# in recent pandas) and re-copying the accumulated rows on every fold.
concatenated_results = pd.concat(fold_frames, ignore_index=True)

Save results

In [None]:
# Write the combined predictions to ../../data/results/<FILENAME>.csv.
# NOTE(review): compression='zip' makes pandas write a zip archive even though
# the path ends in '.csv', so readers need compression='zip' as well. The path
# is left unchanged here because other notebooks may load it by this name.
concatenated_results.to_csv(f'../../data/results/{FILENAME}.csv', index=False, compression='zip')

In [None]:
# Write the same frame as a pickle to ../../data/results/<FILENAME>.pkl.
# NOTE(review): the file is zip-compressed despite the '.pkl' extension, so
# pd.read_pickle needs compression='zip' to load it.
concatenated_results.to_pickle(f'../../data/results/{FILENAME}.pkl', compression='zip')