In [1]:
# !pip install modal

In [2]:
import sys
from modal import App, Image, Secret, web_endpoint

image = Image.debian_slim().pip_install(
    # scraping pkgs
    "datasets",
    "transformers",
    "tqdm",
)

app = App(
    name="paraphraser",
    image=image,    
)

In [3]:
@app.function()
def f(i):
    if i % 2 == 0:
        print("hello", i)
    else:
        print("world", i, file=sys.stderr)

    return i * i

In [4]:
@app.local_entrypoint()
def main():
    # run the function locally
    print(f.local(1000))

    # run the function remotely on Modal
    print(f.remote(1000))

    # run the function in parallel and remotely on Modal
    total = 0
    for ret in f.map(range(20)):
        total += ret

    print(total)

In [5]:
main()

hello 1000
1000000


ExecutionError: Object has not been hydrated and doesn't support lazy hydration. This might happen if an object is defined on a different stub, or if it's on the same stub but it didn't get created because it wasn't defined in global scope.

In [None]:
@app.function(gpu="A100")
def paraphrase_batched(
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=3,
    repetition_penalty=1.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    max_length=32,
    batch_size=512,
    ):
    import json
    from datasets import load_dataset, Dataset
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    from tqdm.auto import tqdm
    tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
    model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)
    split = "train"
    print("Loading dataset")
    dataset = load_dataset("ms_marco", "v2.1", split=split)
    questions = [f'paraphrase: {question}' for question in dataset['query']]
    print("Tokenization")
    input_ids = tokenizer(
        questions,
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids.to(device)
    print("Generating paraphrases")
    results = []
    for i in tqdm(range(0, len(input_ids), batch_size)):
        batch = input_ids[i:i+batch_size].to(device)
        outputs = model.generate(
            batch, repetition_penalty=repetition_penalty,
            num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
            num_beams=num_beams, num_beam_groups=num_beam_groups,
            max_length=max_length, diversity_penalty=diversity_penalty, do_sample=False
        )
        result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        results.extend(result)

        # Write results to a json file as a jsonline every 2 iterations
        if i % 2 == 0:
            with open('results.jsonl', 'w') as f:
                for result in results:
                    f.write(json.dumps(result) + '\n')
            ds = Dataset.from_dict({"query": results})
            ds.push_to_hub("nirantk/msmarco-expansions", token="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")
    return results

a = paraphrase_batched.remote()