# Download artifacts/data from wandb for figure generation

**Purpose:** This script downloads artifacts from Weights & Biases to retrieve the results of the 10x10 K-fold cross-validations and of the ChemBERTa-version and loss-function sweeps.

**Dependency:** `hyperparameter_sweep.ipynb`, `Kfold_crossvalidation_sweep.ipynb`. This script requires that their results have been logged to Weights & Biases: the ChemBERTa-version and loss-function results come from `hyperparameter_sweep.ipynb`, and the 10x10 K-fold cross-validation results come from `Kfold_crossvalidation_sweep.ipynb`.

**Consecutive scripts:** After running this script, the following script may be executed: `generate_figures_for_publication.ipynb`.

## Imports

In [1]:
import pandas as pd
import wandb
import json
import os
from tqdm.notebook import tqdm

## Download

In [2]:
def GetRuns(project, sweepid, endpoints):
    """Collect the runs of a W&B sweep whose ``endpoints`` config matches.

    Args:
        project: W&B project path in the form ``<entity>/<project-name>``.
            Trailing slashes are tolerated and removed.
        sweepid: ID of the sweep whose runs are listed.
        endpoints: Value a run's ``endpoints`` config entry must equal for the
            run to be kept. Runs without an ``endpoints`` entry are treated as
            having ``None`` there, so they are skipped unless ``endpoints`` is
            itself ``None``.

    Returns:
        pandas.DataFrame with one row per matching run and the columns
        ``summary``, ``config``, ``name``, ``run_ids`` and ``sweepid``.
    """
    api = wandb.Api()

    # Strip trailing slashes so a project given as "entity/project/" does not
    # produce the malformed path "entity/project//sweepid".
    runs = api.sweep(f"{project.rstrip('/')}/{sweepid}").runs

    records = []
    for run in runs:
        # .get() avoids a KeyError on runs that never logged `endpoints`.
        if run.config.get('endpoints') != endpoints:
            continue

        records.append({
            # .summary contains the output keys/values for metrics like
            # accuracy. We call ._json_dict to omit large files.
            "summary": run.summary._json_dict,
            # .config contains the hyperparameters.
            # We remove special values that start with _.
            "config": {k: v for k, v in run.config.items()
                       if not k.startswith('_')},
            # .name is the human-readable name of the run.
            "name": run.name,
            "run_ids": run.id,
        })

    # Explicit columns keep the frame well-formed when no run matches.
    selected = pd.DataFrame(
        records, columns=["summary", "config", "name", "run_ids"])
    selected['sweepid'] = sweepid
    return selected

In [3]:
def LoadArtifact(wandbrun, runid, entity, project, artifact_name, fname, version, type, save_cls_embeddings):
    """Load the table stored in a W&B run artifact as a DataFrame.

    If ``./artifacts/run-{runid}-{artifact_name}-{version}`` already exists it
    is read directly; otherwise the artifact is fetched through ``wandbrun``.

    Args:
        wandbrun: Object providing ``use_artifact`` (the run created with
            ``wandb.init`` in this notebook). Only used when no local copy
            of the artifact is found.
        runid: ID of the run that logged the artifact.
        entity: W&B entity owning the project.
        project: W&B project name.
        artifact_name: Artifact name without the ``run-{runid}-`` prefix.
        fname: Name of the table file without the ``.table.json`` suffix.
        version: Artifact version, e.g. ``'v0'``.
        type: Artifact type passed on to ``use_artifact``. The name shadows
            the builtin but is kept so keyword callers keep working.
        save_cls_embeddings: If falsy, the ``CLS_embeddings`` column is
            dropped as well.

    Returns:
        pandas.DataFrame built from the table's ``columns`` and ``data``
        entries, with the metadata columns listed below removed.
    """
    local_dir = f'./artifacts/run-{runid}-{artifact_name}-{version}'
    if os.path.isdir(local_dir):
        table_dir = local_dir
    else:
        artifact = wandbrun.use_artifact(f'{entity}/{project}/run-{runid}-{artifact_name}:{version}', type=type)
        table_dir = artifact.download()

    # A context manager closes the handle; the original left it open.
    with open(f'{table_dir}/{fname}.table.json', encoding='utf-8') as table_file:
        table = json.load(table_file)

    df = pd.DataFrame(data=table['data'], columns=table['columns'])

    unwanted = ['internal_id', 'Conc_sign', 'species_group', 'Pubchem_CID', 'xlogp', 'mw', 'Canonical_SMILES', 'Lineage', 'OneHotEnc_effect', 'OneHotEnc_endpoint']

    if not save_cls_embeddings:
        print('Dropping CLS embeddings\n')
        unwanted.append('CLS_embeddings')

    # errors='ignore' skips columns the table does not contain, replacing the
    # per-column bare `except: pass` that also hid unrelated errors.
    df.drop(columns=unwanted, inplace=True, errors='ignore')

    return df

In [4]:
def CombineKFoldPredictions(wandbrun, runs_df, project, version, artifact_name, name, save_cls_embeddings):
    """Download the prediction table of every run and stack them.

    Each run's table is loaded with ``LoadArtifact`` (entity
    ``'ecotoxformer'``, artifact type ``'run_table'``) and tagged with the
    ``seed`` and ``fold_id`` found in that run's config.

    Args:
        wandbrun: Active W&B run used to fetch artifacts.
        runs_df: DataFrame with a ``run_ids`` column and a ``config`` column
            whose entries are mappings containing ``seed`` and ``fold_id``.
        project: W&B project name holding the runs.
        version: Artifact version, e.g. ``'v0'``.
        artifact_name: Artifact name without the ``run-{runid}-`` prefix.
        name: Name of the table file inside the artifact.
        save_cls_embeddings: Forwarded to ``LoadArtifact``.

    Returns:
        pandas.DataFrame with the rows of all tables, a fresh 0..n-1 index,
        and the added columns ``seed`` and ``fold_id``.

    Raises:
        ValueError: If ``runs_df`` has no rows.
    """
    if len(runs_df) == 0:
        raise ValueError('runs_df contains no runs to combine')

    frames = []
    # Iterate positionally so a non-default index on runs_df cannot break or
    # misalign the run id / config pairing.
    run_info = zip(runs_df['run_ids'], runs_df['config'])
    for runid, config in tqdm(run_info, total=len(runs_df)):
        fold_df = LoadArtifact(
            wandbrun=wandbrun,
            runid=runid,
            entity='ecotoxformer',
            project=project,
            artifact_name=artifact_name,
            fname=name,
            version=version,
            type='run_table',
            save_cls_embeddings=save_cls_embeddings)

        fold_df['seed'] = config['seed']
        fold_df['fold_id'] = config['fold_id']
        frames.append(fold_df)

    # One concat at the end avoids re-copying all earlier rows per run.
    return pd.concat(frames, ignore_index=True)

def CombinePredictions(wandbrun, runs_df, project, version, artifact_name, name, save_cls_embeddings):
    """Download the prediction table of every run and stack them.

    Each run's table is loaded with ``LoadArtifact`` (entity
    ``'ecotoxformer'``, artifact type ``'run_table'``) and tagged with the
    ``base_model`` and ``loss_fun`` listed for that run in ``runs_df``.

    Args:
        wandbrun: Active W&B run used to fetch artifacts.
        runs_df: DataFrame with the columns ``run_ids``, ``base_model`` and
            ``loss_fun``.
        project: W&B project name holding the runs.
        version: Artifact version, e.g. ``'v0'``.
        artifact_name: Artifact name without the ``run-{runid}-`` prefix.
        name: Name of the table file inside the artifact.
        save_cls_embeddings: Forwarded to ``LoadArtifact``.

    Returns:
        pandas.DataFrame with the rows of all tables, a fresh 0..n-1 index,
        and the added columns ``base_model`` and ``loss_fun``.

    Raises:
        ValueError: If ``runs_df`` has no rows.
    """
    if len(runs_df) == 0:
        raise ValueError('runs_df contains no runs to combine')

    frames = []
    # Iterate positionally so a non-default index on runs_df cannot break or
    # misalign the per-run metadata.
    run_info = zip(runs_df['run_ids'], runs_df['base_model'], runs_df['loss_fun'])
    for runid, base_model, loss_fun in tqdm(run_info, total=len(runs_df)):
        run_table = LoadArtifact(
            wandbrun=wandbrun,
            runid=runid,
            entity='ecotoxformer',
            project=project,
            artifact_name=artifact_name,
            fname=name,
            version=version,
            type='run_table',
            save_cls_embeddings=save_cls_embeddings)

        run_table['base_model'] = base_model
        run_table['loss_fun'] = loss_fun
        frames.append(run_table)

    # One concat at the end avoids re-copying all earlier rows per run.
    return pd.concat(frames, ignore_index=True)

In [5]:
# Authenticate with Weights & Biases before using the API or artifacts.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mstyrbjornkall[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
# Start a temporary run; LoadArtifact fetches artifacts through its
# `use_artifact` method.
wandbrun = wandb.init(project="artifacts-analysis", job_type='tmp_run')

In [7]:
# --- Which sweep to download -------------------------------------------------
PROJECT_NAME = "100Fold_CV_RDKit_invertebrates"
SWEEP_ID = "u1c9loib"

# Only runs whose `endpoints` config equals this list are selected.
ENDPOINT = ["EC50", "EC10", "NOEC"]

# --- Which artifact to fetch from each run -----------------------------------
ARTIFACT_NAME = "BestValidationResults"
ARTIFACT_VERSION = "v0"
# Table file name inside the artifact: ARTIFACT_NAME split at capital letters.
DOWNLOADED_ARTIFACT_NAME = "Best Validation Results"

# When False, the CLS_embeddings column is dropped from every loaded table.
SAVE_CLS_EMBEDDINGS = False

# --- Output ------------------------------------------------------------------
# Base filename (without extension) to which the results will be saved.
FILENAME = "EC50EC10_invertebrates_withoverlap_predictions_100x_CV_RDKit"

In [8]:
# Select the sweep's runs trained on ENDPOINT. The project path carries no
# trailing slash because GetRuns appends "/<sweep id>" itself; a trailing
# slash would yield "entity/project//sweep".
runs_df = GetRuns(project=f"ecotoxformer/{PROJECT_NAME}", sweepid=SWEEP_ID, endpoints=ENDPOINT)

In [9]:
# Display the selected runs as a quick sanity check.
runs_df

Unnamed: 0,summary,config,name,run_ids,sweepid
0,"{'_step': 3682, '_timestamp': 1691580273.35834...","{'lr': 0.0002, 'seed': 45, 'label': 'mgperL', ...",wobbly-sweep-100,d4bioj8u,u1c9loib
1,{'Validation RMSE Loss Normalized': 0.84046807...,"{'lr': 0.0002, 'seed': 50, 'label': 'mgperL', ...",noble-sweep-100,1tslsxkh,u1c9loib
2,{'Validation RMSE Loss Normalized': 0.90158873...,"{'lr': 0.0002, 'seed': 49, 'label': 'mgperL', ...",sandy-sweep-99,1yns1xnv,u1c9loib
3,"{'_timestamp': 1691571497.6975565, 'Learning R...","{'lr': 0.0002, 'seed': 48, 'label': 'mgperL', ...",good-sweep-98,1gmvan2d,u1c9loib
4,{'Validation RMSE Loss Normalized': 0.82821230...,"{'lr': 0.0002, 'seed': 47, 'label': 'mgperL', ...",earthy-sweep-97,6z00xt0o,u1c9loib
...,...,...,...,...,...
95,{'Best Validation Median Loss Normalized': 0.3...,"{'lr': 0.0002, 'seed': 45, 'label': 'mgperL', ...",swift-sweep-5,oicbyak6,u1c9loib
96,{'Training df': {'sha256': '5a135be70275400aee...,"{'lr': 0.0002, 'seed': 43, 'label': 'mgperL', ...",stellar-sweep-3,k6rnxzu9,u1c9loib
97,"{'training batch': 3639, '_wandb': {'runtime':...","{'lr': 0.0002, 'seed': 44, 'label': 'mgperL', ...",young-sweep-4,ndyab7l8,u1c9loib
98,"{'Best Validation Results': {'ncols': 22, 'nro...","{'lr': 0.0002, 'seed': 42, 'label': 'mgperL', ...",youthful-sweep-2,3v9gknqk,u1c9loib


K-Fold CV

In [None]:
# Download the prediction table of every selected run and stack them into a
# single DataFrame tagged with each run's seed and fold_id.
concatenated_results = CombineKFoldPredictions(
    wandbrun=wandbrun, 
    runs_df=runs_df,
    project=PROJECT_NAME, 
    version=ARTIFACT_VERSION, 
    artifact_name=ARTIFACT_NAME,
    name=DOWNLOADED_ARTIFACT_NAME,
    save_cls_embeddings=SAVE_CLS_EMBEDDINGS)

Base model runs

In [None]:
# Runs of the base-model / loss-function comparison, listed by hand.
runs_df = pd.DataFrame(['56vivdmi','dl8u33ui','78lq56v1','128aaus7'], columns=['run_ids'])
runs_df['loss_fun'] = ['L1Loss','L1Loss','MSELoss','MSELoss']
runs_df['base_model'] = ['seyonec/SMILES_tokenized_PubChem_shard00_160k','seyonec/PubChem10M_SMILES_BPE_450k','seyonec/SMILES_tokenized_PubChem_shard00_160k','seyonec/PubChem10M_SMILES_BPE_450k']

# NOTE(review): FILENAME still holds the K-fold name set earlier; presumably
# it should be changed before saving these results. Confirm.
fold_frames = []
for i in tqdm(range(5)):
    # Each run logged one artifact per fold, numbered from 1.
    # save_cls_embeddings is a required argument; omitting it raised TypeError.
    df = CombinePredictions(
        wandbrun,
        runs_df,
        'base_model_sweep_RDKit',
        'v0',
        f'BestValidationResults{i+1}',
        f'Best Validation Results {i+1}',
        save_cls_embeddings=SAVE_CLS_EMBEDDINGS)
    df['fold_id'] = i+1
    fold_frames.append(df)

# Concatenate once instead of growing an initially empty DataFrame per fold.
concatenated_results = pd.concat(fold_frames, ignore_index=True)

Save results

In [None]:
# Write the combined results as a zip-compressed CSV without the index column.
concatenated_results.to_csv(f'../../data/results/{FILENAME}.csv.zip', index=False, compression='zip')

In [None]:
# Also write the combined results as a zip-compressed pickle.
concatenated_results.to_pickle(f'../../data/results/{FILENAME}.pkl.zip', compression='zip')