# ReproLab Demo

Welcome to ReproLab! This extension helps you make your research more reproducible.

## Features

- **Create Experiments**: Automatically save immutable snapshots of your code under `git` tags to preserve the **exact code and outputs**
- **Manage Dependencies**: Automatically gather and pin **exact package versions**, so that others can set up your environment with one command
- **Cache Data**: Call an external API or load a dataset manually only once; the caching function handles the rest
- **Archive Data**: The caching function can also preserve the compressed data in *AWS S3*, so you always know what data was used and make fewer API calls
- **Publishing guide**: The reproducibility checklist and automated generation of a reproducibility package make publishing to platforms such as Zenodo very easy

## Getting Started

1. Use the sidebar to view ReproLab features
2. Create a virtual environment and pin your dependencies: go to the ReproLab section `Create reproducible environment`
3. Create an experiment to save your current state: go to the ReproLab section `Create experiment`
4. Archive your data for long-term storage: go to the ReproLab section `Demo` and play around with it
5. Publish your work when ready; remember to use the reproducibility checklist from the section `Reproducibility Checklist`

## Example Usage of persistio decorator

To cache and archive the datasets you use, whether they come from local files or from APIs, we developed a simple decorator. Put it over the function that gets the dataset, and it caches the result both locally and in the cloud. This way the dataset you use is archived, the number of calls to external APIs is minimal, and you don't need to keep the original file around after you have run the function once.

Here is an example using one of NASA's open APIs. If you want to test it out yourself, you can copy the code, but you first need to provide the bucket name, access key, and secret key in the left-hand panel under the `AWS S3 Configuration` section.

```python
import requests
import pandas as pd
from io import StringIO

# The two lines below are all that you need to add
from reprolab.experiment import persistio
@persistio()
def get_exoplanets_data_from_nasa():
    """Fetch ten rows of exoplanet data from the NASA Exoplanet Archive.

    Sends an ADQL query to the archive's TAP ``sync`` endpoint asking for the
    first 10 rows of table ``ps`` where ``default_flag = 1``, requests the
    result as CSV, prints the parsed table and returns it.

    Returns:
        pandas.DataFrame with the columns ``planet_name``, ``host_star``,
        ``orbital_period_days``, ``planet_radius_earth`` and
        ``discovery_year``.

    Raises:
        RuntimeError: If the service answers with a status other than 200.
        requests.RequestException: If the request itself fails or times out.
    """
    url = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync"

    query = """
    SELECT TOP 10
        pl_name AS planet_name,
        hostname AS host_star,
        pl_orbper AS orbital_period_days,
        pl_rade AS planet_radius_earth,
        disc_year AS discovery_year
    FROM
        ps
    WHERE
        default_flag = 1
    """

    params = {
        "query": query,
        "format": "csv"
    }

    # Without a timeout an unresponsive server would block the cell forever.
    response = requests.get(url, params=params, timeout=60)

    if response.status_code != 200:
        # There is no DataFrame to return here, so fail with the server's
        # message instead of falling through to an undefined result.
        raise RuntimeError(f"Error: {response.status_code} - {response.text}")

    df = pd.read_csv(StringIO(response.text))

    print(df)

    return df

# Run the fetch; the call goes through the persistio decorator applied above.
exoplanets_data = get_exoplanets_data_from_nasa()
```

If you run this cell twice, you will notice from the logs that the second time the data was read from the compressed file in the cache. If you were to lose access to the local cache (e.g. by pulling the repository on a different device), `persistio` would fetch the data from the cloud archive.


For more information, visit our [documentation](https://github.com/your-repo/reprolab). 


In [2]:
import subprocess
import re

def list_and_sort_git_tags(repo_path='.'):
    """Return the git tags of a repository, highest version first.

    Runs ``git -C <repo_path> tag`` and orders the output by the numeric
    ``vMAJOR.MINOR.PATCH`` prefix of each tag, in descending order. Tags that
    do not start with that pattern are placed after all versioned tags, in
    the order git listed them.

    Args:
        repo_path: Path of the git repository to inspect. Defaults to the
            current directory.

    Returns:
        A list of tag names. An empty list is returned (after printing the
        error) when the git command fails or the git executable is missing.
    """
    def tag_to_sort_key(tag):
        # Compare the components numerically as a tuple. Concatenating the
        # digits would rank v1.10.0 (-> 1100) above v2.0.0 (-> 200).
        match = re.match(r'v(\d+)\.(\d+)\.(\d+)', tag)
        if match:
            return tuple(int(part) for part in match.groups())
        # Lower than any real version, so with reverse=True malformed tags
        # end up at the end of the list.
        return (-1, -1, -1)

    try:
        result = subprocess.run(
            ['git', '-C', repo_path, 'tag'],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
    except subprocess.CalledProcessError as e:
        print(f"Error listing tags: {e.stderr}")
        return []
    except FileNotFoundError as e:
        # Raised when the git executable itself cannot be found.
        print(f"Error listing tags: {e}")
        return []

    tags = [tag for tag in result.stdout.splitlines() if tag]
    return sorted(tags, key=tag_to_sort_key, reverse=True)

# Example usage: list the tags of the repository in the current directory
# (repo_path defaults to '.') and print them in the returned, descending order.
sorted_tags = list_and_sort_git_tags()
print(sorted_tags)


['v1.5.0', 'v1.4.0', 'v1.3.0', 'v1.2.0', 'v1.1.0', 'v1.0.0']


In [5]:
import subprocess
import os
import shutil

def zip_git_tag(tag, repo_path='.', output_zip_path=None):
    """Export the contents of a git tag as a zip file via ``git archive``.

    Args:
        tag: Name of the tag to export.
        repo_path: Path of the git repository. Defaults to the current
            directory.
        output_zip_path: Destination of the zip file; defaults to
            ``'<tag>.zip'``. The value is handed to git unchanged.
            NOTE(review): git runs with ``-C repo_path``, so a relative path
            is presumably resolved against ``repo_path`` - confirm.

    Returns:
        True if git created the archive, False otherwise (the error is
        printed).
    """
    if output_zip_path is None:
        output_zip_path = f'{tag}.zip'

    try:
        # git archive reads straight from the object store, so no temporary
        # checkout directory is needed. stderr is captured so the handler
        # below can report git's own message.
        subprocess.run(
            ['git', '-C', repo_path, 'archive', '--format=zip', '--output', output_zip_path, tag],
            check=True,
            stderr=subprocess.PIPE,
            text=True
        )
    except subprocess.CalledProcessError as e:
        print(f"Git error: {e.stderr.strip()}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False

    print(f"Created zip archive of tag '{tag}' at {output_zip_path}")
    return True

# Example usage: export tag 'v1.0.0' of the repository in the current
# directory to 'v1.0.0.zip'. As the last expression of the cell, the returned
# boolean is shown as the cell output.
zip_git_tag('v1.0.0', repo_path='.', output_zip_path='v1.0.0.zip')


Created zip archive of tag 'v1.0.0' at v1.0.0.zip


True

In [8]:
import os
import yaml
import datetime

def persist_metadata_for_current_notebook(cell_hash, code_origin, bucket_name, notebook_name):
    """Record an instrumented cell in the notebook's persistio archive file.

    The file ``<notebook_name>_persistio_archive.yaml`` is loaded when it
    exists, updated and written back. ``jupyter_notebook`` and
    ``last_executed`` are overwritten on every call; ``creation_data``,
    ``bucket_name`` and ``cells_instrumented`` are only set when absent. The
    entry in ``cells_instrumented`` whose ``hash`` equals ``cell_hash`` gets
    its ``code_origin`` replaced, or a new entry is appended if none matches.

    Args:
        cell_hash: Hash identifying the cell entry.
        code_origin: Value stored under ``code_origin`` for that entry.
        bucket_name: Bucket name stored if the file does not have one yet.
        notebook_name: Notebook name; also the prefix of the YAML file name.

    Returns:
        None. Any exception is caught and reported with a printed message.
    """
    try:
        archive_path = f"{notebook_name}_persistio_archive.yaml"
        timestamp = datetime.datetime.now(datetime.UTC)

        # Start from the stored metadata when the file exists (an empty file
        # loads as None, hence the fallback), otherwise from a blank mapping.
        record = {}
        if os.path.exists(archive_path):
            with open(archive_path, 'r') as source:
                record = yaml.safe_load(source) or {}

        # Refreshed on every call.
        record['jupyter_notebook'] = notebook_name
        record['last_executed'] = timestamp

        # Written once and left untouched on later calls.
        record.setdefault('creation_data', timestamp)
        record.setdefault('bucket_name', bucket_name)
        cells = record.setdefault('cells_instrumented', [])

        # Update the first entry with this hash, or add one if there is none.
        for entry in cells:
            if entry['hash'] == cell_hash:
                entry['code_origin'] = code_origin
                break
        else:
            cells.append({
                'hash': cell_hash,
                'code_origin': code_origin
            })

        # sort_keys=False keeps the keys in insertion order in the file.
        with open(archive_path, 'w') as target:
            yaml.dump(record, target, sort_keys=False)

        print(f"✅ Metadata written to {archive_path}")
    except Exception as e:
        print(f"❌ Error persisting metadata: {e}")

# Example call with sample values; it creates or updates
# 'zenodo_section.ipynb_persistio_archive.yaml' in the working directory.
persist_metadata_for_current_notebook('123', 'testing', 'bucket2', 'zenodo_section.ipynb')

✅ Metadata written to zenodo_section.ipynb_persistio_archive.yaml


In [12]:
import os
import glob

def get_last_changed_notebook():
    """
    Find the Jupyter notebook (.ipynb) in the current directory with the
    newest modification time and return its file name.

    Raises RuntimeError when the directory contains no notebook or when any
    other error occurs during the lookup.
    """
    try:
        # Only the current directory is searched, not its subdirectories
        candidates = glob.glob('*.ipynb')
        if len(candidates) == 0:
            raise RuntimeError("No .ipynb files found in the current directory")

        # Pick the entry with the largest mtime; on a tie the first one wins
        mtimes = [os.path.getmtime(path) for path in candidates]
        newest_index = mtimes.index(max(mtimes))
        return candidates[newest_index]
    except Exception as e:
        raise RuntimeError(f"Error finding last changed notebook: {str(e)}")
# Trailing expression: the notebook shows the returned file name as the cell output.
get_last_changed_notebook()

'zenodo_section.ipynb'

In [13]:
import requests
import pandas as pd
from io import StringIO

# The two lines below are all that you need to add
from reprolab.experiment import persistio
@persistio()
def get_exoplanets_data_from_nasa():
    """Fetch ten rows of exoplanet data from the NASA Exoplanet Archive.

    Sends an ADQL query to the archive's TAP ``sync`` endpoint asking for the
    first 10 rows of table ``ps`` where ``default_flag = 1``, requests the
    result as CSV, prints the parsed table and returns it.

    Returns:
        pandas.DataFrame with the columns ``planet_name``, ``host_star``,
        ``orbital_period_days``, ``planet_radius_earth`` and
        ``discovery_year``.

    Raises:
        RuntimeError: If the service answers with a status other than 200.
        requests.RequestException: If the request itself fails or times out.
    """
    url = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync"

    query = """
    SELECT TOP 10
        pl_name AS planet_name,
        hostname AS host_star,
        pl_orbper AS orbital_period_days,
        pl_rade AS planet_radius_earth,
        disc_year AS discovery_year
    FROM
        ps
    WHERE
        default_flag = 1
    """

    params = {
        "query": query,
        "format": "csv"
    }

    # Without a timeout an unresponsive server would block the cell forever.
    response = requests.get(url, params=params, timeout=60)

    if response.status_code != 200:
        # There is no DataFrame to return here, so fail with the server's
        # message instead of falling through to an undefined result.
        raise RuntimeError(f"Error: {response.status_code} - {response.text}")

    df = pd.read_csv(StringIO(response.text))

    print(df)

    return df

# Run the fetch; the call goes through the persistio decorator applied above.
exoplanets_data = get_exoplanets_data_from_nasa()


[persistio] Function: get_exoplanets_data_from_nasa
[persistio] Hash: ca840447667cb2059aa83ed68ec9e995
[persistio] Attempting to load from local cache...
[persistio] Successfully loaded from local cache!
