# Download artifacts/data from wandb for figure generation

**Purpose:** This notebook downloads artifacts from Weights & Biases (W&B) to retrieve the results of the 10x10 K-fold cross-validations and of the ChemBERTa-version and loss-function sweeps.

**Dependency:** `hyperparameter_sweep.ipynb`, `Kfold_crossvalidation_sweep.ipynb`. This notebook requires that their results have already been logged to Weights & Biases: the ChemBERTa-version and loss-function results come from `hyperparameter_sweep.ipynb`, and the 10x10 K-fold cross-validation results come from `Kfold_crossvalidation_sweep.ipynb`.

**Consecutive scripts:** After running this notebook, the following notebook may be executed: `generate_figures_for_publication.ipynb`.

## Imports

In [11]:
import pandas as pd
import wandb
import json
from tqdm.notebook import tqdm

## Download

In [2]:
def GetRuns(project, sweepid, endpoints):
    """Collect the runs of a W&B sweep whose configured endpoints match.

    Args:
        project: Sweep location given as ``<entity>/<project-name>``. A
            trailing ``/`` is tolerated and removed.
        sweepid: Identifier of the sweep inside ``project``.
        endpoints: Value that ``run.config['endpoints']`` must equal for a
            run to be kept.

    Returns:
        pd.DataFrame with one row per matching run and the columns
        ``summary`` (the run's ``summary._json_dict``), ``config`` (the run
        config without keys starting with ``_``), ``name``, ``run_ids`` and
        ``sweepid``.
    """
    api = wandb.Api()

    # Strip trailing slashes so the joined path never contains an empty
    # '//' segment between the project and the sweep id.
    sweep_path = f"{project.rstrip('/')}/{sweepid}"
    runs = api.sweep(sweep_path).runs

    summary_list, config_list, name_list, ids = [], [], [], []
    for run in runs:
        # A run without an 'endpoints' entry cannot match the filter; skip it
        # instead of aborting the whole download with a KeyError.
        if 'endpoints' not in run.config or run.config['endpoints'] != endpoints:
            continue

        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith('_')})

        # .name is the human-readable name of the run.
        name_list.append(run.name)
        ids.append(run.id)

    return pd.DataFrame({
        "summary": summary_list,
        "config": config_list,
        "name": name_list,
        "run_ids": ids,
        'sweepid': sweepid
        })

In [3]:
def LoadArtifact(wandbrun, runid, entity, project, artifact_name, fname, version, type):
    """Download a run's table artifact and return it as a trimmed DataFrame.

    Args:
        wandbrun: Object whose ``use_artifact`` method fetches the artifact
            (the notebook passes the run returned by ``wandb.init``).
        runid: Id of the run that logged the table.
        entity: W&B entity that owns ``project``.
        project: W&B project name.
        artifact_name: Artifact name suffix; the artifact is addressed as
            ``{entity}/{project}/run-{runid}-{artifact_name}:{version}``.
        fname: Base name of the ``<fname>.table.json`` file inside the
            downloaded artifact directory.
        version: Artifact version, e.g. ``'v0'``.
        type: Artifact type forwarded to ``use_artifact``. The name shadows
            the builtin but is kept because callers pass it by keyword.

    Returns:
        pd.DataFrame built from the table's ``columns`` and ``data`` entries,
        without any of the columns listed in ``dropped_columns`` below.
    """
    artifact = wandbrun.use_artifact(
        f'{entity}/{project}/run-{runid}-{artifact_name}:{version}', type=type)
    artifact_dir = artifact.download()

    # Use a context manager so the file handle is closed deterministically.
    with open(f'{artifact_dir}/{fname}.table.json') as table_file:
        table = json.load(table_file)

    df = pd.DataFrame(data=table['data'], columns=table['columns'])

    dropped_columns = ['internal_id', 'Conc_sign', 'species_group', 'Pubchem_CID', 'xlogp', 'mw', 'Canonical_SMILES', 'Lineage', 'OneHotEnc_effect', 'OneHotEnc_endpoint']

    # errors='ignore' skips columns that are absent from this table without
    # a bare ``except`` that would also hide unrelated failures.
    return df.drop(columns=dropped_columns, errors='ignore')

In [4]:
def _CombineRunTables(wandbrun, runs_df, project, version, artifact_name, name, tags_for, entity):
    """Download one table per row of ``runs_df`` and stack them into one frame.

    ``tags_for(pos)`` must return a ``{column: value}`` dict; each value is
    broadcast onto every row of the table downloaded for the run at
    position ``pos``. Returns an empty DataFrame when ``runs_df`` has no rows.
    """
    tables = []
    for pos in tqdm(range(len(runs_df))):
        table = LoadArtifact(
            wandbrun=wandbrun,
            # Positional access: works even when runs_df was filtered and its
            # index no longer starts at 0.
            runid=runs_df['run_ids'].iloc[pos],
            entity=entity,
            project=project,
            artifact_name=artifact_name,
            fname=name,
            version=version,
            type='run_table')
        tables.append(table.assign(**tags_for(pos)))

    if not tables:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(tables, ignore_index=True)


def CombineKFoldPredictions(wandbrun, runs_df, project, version, artifact_name, name, entity='ecotoxformer'):
    """Stack the table artifacts of all runs, tagged with seed and fold id.

    Args:
        wandbrun: Run object handed to ``LoadArtifact`` for the downloads.
        runs_df: DataFrame with a ``run_ids`` column and a ``config`` column
            whose dict entries contain ``'seed'`` and ``'fold_id'``.
        project: W&B project name the runs belong to.
        version: Artifact version, e.g. ``'v0'``.
        artifact_name: Artifact name suffix passed to ``LoadArtifact``.
        name: Base name of the table file passed to ``LoadArtifact`` as ``fname``.
        entity: W&B entity that owns ``project``.

    Returns:
        pd.DataFrame with the rows of every downloaded table plus ``seed``
        and ``fold_id`` columns, re-indexed from 0. Empty when ``runs_df``
        has no rows.
    """
    def tags_for(pos):
        config = runs_df['config'].iloc[pos]
        return {'seed': config['seed'], 'fold_id': config['fold_id']}

    return _CombineRunTables(
        wandbrun, runs_df, project, version, artifact_name, name, tags_for, entity)


def CombinePredictions(wandbrun, runs_df, project, version, artifact_name, name, entity='ecotoxformer'):
    """Stack the table artifacts of all runs, tagged with base model and loss.

    Args:
        wandbrun: Run object handed to ``LoadArtifact`` for the downloads.
        runs_df: DataFrame with ``run_ids``, ``base_model`` and ``loss_fun``
            columns.
        project: W&B project name the runs belong to.
        version: Artifact version, e.g. ``'v0'``.
        artifact_name: Artifact name suffix passed to ``LoadArtifact``.
        name: Base name of the table file passed to ``LoadArtifact`` as ``fname``.
        entity: W&B entity that owns ``project``.

    Returns:
        pd.DataFrame with the rows of every downloaded table plus
        ``base_model`` and ``loss_fun`` columns, re-indexed from 0. Empty
        when ``runs_df`` has no rows.
    """
    def tags_for(pos):
        return {
            'base_model': runs_df['base_model'].iloc[pos],
            'loss_fun': runs_df['loss_fun'].iloc[pos],
        }

    return _CombineRunTables(
        wandbrun, runs_df, project, version, artifact_name, name, tags_for, entity)

In [5]:
# Log in to Weights & Biases before any API or artifact call is made.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mstyrbjornkall[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
# Start a run in the "artifacts-analysis" project. The run object is passed to
# the Combine* helpers, which fetch artifacts through its use_artifact().
wandbrun = wandb.init(project="artifacts-analysis", job_type='tmp_run')

In [39]:
# W&B project (under the 'ecotoxformer' entity) and sweep to download from.
PROJECT_NAME = 'Judson'
SWEEP_ID = 'crnn65md'
# Only runs whose config['endpoints'] equals this list are selected by GetRuns.
ENDPOINT = ['EC10','NOEC']
# Artifact name suffix, used as run-<run id>-<ARTIFACT_NAME>:<ARTIFACT_VERSION>.
ARTIFACT_NAME = 'BestValidationResults'
# Base name of the '<name>.table.json' file inside the downloaded artifact.
DOWNLOADED_ARTIFACT_NAME = 'Best Validation Results' #Split ARTIFACT_NAME by capital letters
ARTIFACT_VERSION = 'v0'
# File stem for the saved results. Derived from PROJECT_NAME so that switching
# project cannot silently overwrite another project's results.
FILENAME = f'{PROJECT_NAME}_{ENDPOINT[0]}_results' #Filename to which results will be saved

In [40]:
# GetRuns appends '/<sweepid>' itself, so the project path is passed without a
# trailing slash to avoid an empty '//' segment in the sweep path.
runs_df = GetRuns(project=f"ecotoxformer/{PROJECT_NAME}", sweepid=SWEEP_ID, endpoints=ENDPOINT)

In [41]:
# Inspect which runs were selected before downloading their artifacts.
runs_df

Unnamed: 0,summary,config,name,run_ids,sweepid
0,"{'Training Median Loss': 0.22910571098327637, ...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",neat-sweep-10,hm44g2pg,crnn65md
1,"{'_runtime': 218.1261703968048, 'x-axis 3, 2, ...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",woven-sweep-12,al1qkavt,crnn65md
2,"{'Training Mean Loss': 0.4257524013519287, 'Va...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",silvery-sweep-11,yh96jgao,crnn65md
3,"{'_wandb': {'runtime': 238}, 'Validation data'...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",elated-sweep-10,8o5jnag6,crnn65md
4,"{'_timestamp': 1676292325.0823796, 'Learning R...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",bumbling-sweep-9,gd7j1y14,crnn65md
5,"{'_runtime': 222.8538691997528, 'Training df':...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",easy-sweep-8,kbw8wdn4,crnn65md
6,"{'Learning Rate': 0, 'Training data': {'_lates...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",olive-sweep-7,l4mtnj3n,crnn65md
7,"{'Validation data': {'size': 78250, '_type': '...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",resilient-sweep-6,rgt8dfat,crnn65md
8,"{'Validation Median Loss': 0.8560799956321716,...","{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",helpful-sweep-5,n4itmrow,crnn65md
9,{'Test Mean Loss Normalized': 0.77482743675096...,"{'lr': 0.0005, 'seed': 41, 'label': 'COMBINED_...",splendid-sweep-4,inj5ecei,crnn65md


K-Fold CV

In [42]:
# Download the table artifact of every run in runs_df and stack them into one
# frame; each row is tagged with the seed and fold_id from its run's config.
concatenated_results = CombineKFoldPredictions(
    wandbrun=wandbrun, 
    runs_df=runs_df,
    project=PROJECT_NAME, 
    version=ARTIFACT_VERSION, 
    artifact_name=ARTIFACT_NAME,
    name=DOWNLOADED_ARTIFACT_NAME)

[34m[1mwandb[0m:   1 of 1 files downloaded.  


  0%|          | 0/9 [00:00<?, ?it/s]

[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


Base model runs

In [None]:
# NOTE(review): this cell rebinds runs_df and concatenated_results, so the save
# cells below write whichever of the K-fold / base-model cells ran last under
# FILENAME - adjust FILENAME before saving base-model results.

# The four base-model sweep runs: each loss function paired with each base model.
runs_df = pd.DataFrame({
    'run_ids': ['56vivdmi', 'dl8u33ui', '78lq56v1', '128aaus7'],
    'loss_fun': ['L1Loss', 'L1Loss', 'MSELoss', 'MSELoss'],
    'base_model': [
        'seyonec/SMILES_tokenized_PubChem_shard00_160k',
        'seyonec/PubChem10M_SMILES_BPE_450k',
        'seyonec/SMILES_tokenized_PubChem_shard00_160k',
        'seyonec/PubChem10M_SMILES_BPE_450k',
    ],
})

# Each run logged one table per fold, numbered 1..5 in the artifact name.
fold_tables = []
for fold in tqdm(range(1, 6)):
    fold_table = CombinePredictions(
        wandbrun, runs_df, 'base_model_sweep_RDKit', 'v0',
        f'BestValidationResults{fold}', f'Best Validation Results {fold}')
    fold_tables.append(fold_table.assign(fold_id=fold))

# Concatenate once instead of growing a frame inside the loop.
concatenated_results = pd.concat(fold_tables, ignore_index=True)

Save results

In [16]:
# Write concatenated_results as a zip-compressed CSV (row index omitted) under
# the FILENAME stem. NOTE(review): assumes ../../data/results/ already exists.
concatenated_results.to_csv(f'../../data/results/{FILENAME}.csv.zip', index=False, compression='zip')

In [43]:
# Write concatenated_results as a zip-compressed pickle under the same FILENAME
# stem. NOTE(review): assumes ../../data/results/ already exists.
concatenated_results.to_pickle(f'../../data/results/{FILENAME}.pkl.zip', compression='zip')