In [None]:
import wandb
# Start a W&B run; no project/entity is passed here, so wandb falls back to
# whatever defaults are configured in the environment.
run = wandb.init()
# Register version `v0` of the `raw_dataset` artifact (project `parambharat/mave`)
# as an input of this run.
artifact = run.use_artifact('parambharat/mave/raw_dataset:v0', type='dataset')
# Fetch the artifact's files locally and keep whatever `download()` returns
# (presumably the local directory path - confirm against the wandb docs).
artifact_dir = artifact.download()


In [None]:
project("parambharat", "mave").artifact("raw_dataset").membershipForAlias("v1").artifactVersion.file("raw_dataset.table.json")

In [None]:
project("parambharat", "mave").artifact("split_dataset").membershipForAlias("v1").artifactVersion.file("train.table.json")

In [None]:
import json


# utility to preprocess and prepare the dataset.
def prepare_dataset(row):
    """Turn one raw dataset row into a fine-tuning prompt/completion pair.

    Only evidences that point at a paragraph whose ``source`` is ``"title"``
    are considered. For those, the attribute value is recorded under the
    attribute key unless the same value (compared case-insensitively) has
    already been recorded for this row.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``"paragraphs"``, ``"attributes"`` and ``"category"``. Each
            attribute needs a ``"key"`` and a list of ``"evidences"``; each
            evidence needs a ``"pid"`` usable as an index/key into
            ``paragraphs`` and a string ``"value"``.

    Returns:
        pd.Series with two entries:
            ``"prompt"``: the text of every paragraph that contributed a
            value, one per line, in the order each paragraph was first used,
            followed by the ``"==>\\n"`` separator.
            ``"completion"``: a leading space, the JSON-encoded attribute
            dict (with ``"category"`` added last) and the
            ``"\\n\\n###\\n\\n"`` stop sequence.
    """
    paragraphs = row["paragraphs"]
    attributes = row["attributes"]

    extracted = {}
    pids = []
    for attribute in attributes:
        key = attribute["key"]
        for evidence in attribute["evidences"]:
            pid = evidence["pid"]
            # A paragraph without a "source" field falls back to its pid.
            source = paragraphs[pid].get("source", pid)
            if source not in ["title"]:
                continue
            value = evidence["value"]
            # Skip values already captured under any key (case-insensitive).
            already_seen = {known.lower() for known in extracted.values()}
            if value.lower() not in already_seen:
                extracted[key] = value
                pids.append(pid)
    extracted["category"] = row["category"]
    completion = " " + json.dumps(extracted) + "\n\n###\n\n"

    prompt = ""
    # dict.fromkeys de-duplicates while keeping first-use order; iterating a
    # set here made the paragraph order depend on hashing, so the same row
    # could yield different prompts from one kernel session to the next.
    for pid in dict.fromkeys(pids):
        prompt += f"{paragraphs[pid].get('text', '')}\n"
    prompt += "==>\n"

    return pd.Series({"prompt": prompt, "completion": completion})


# reuse the artifact for the subset we created earlier
wandb.init(project="mave", entity="parambharat")
artifact = wandb.use_artifact('raw_dataset:latest', type="dataset")
subset = artifact.get("raw_dataset")
subset = pd.DataFrame(subset.data, columns=subset.columns)


# split the dataset into train test and validation splits.
train_df, test_df = train_test_split(subset, stratify=subset.category, test_size=0.25)
val_df, test_df = train_test_split(test_df, stratify=test_df.category, test_size=0.5)


train_df = train_df.apply(prepare_dataset, axis=1)
train_df.to_json("prompts_dataset_train.jsonl", lines=True, orient="records")


val_df = test_df.apply(prepare_dataset, axis=1)
val_df.to_json("prompts_dataset_val.jsonl", lines=True, orient="records")


test_df = test_df.apply(prepare_dataset, axis=1)
test_df.to_json("prompts_dataset_test.jsonl", lines=True, orient="records")


# run openai dataset preparation tool.
!openai tools fine_tunes.prepare_data -f prompts_dataset_train.jsonl -q
!openai tools fine_tunes.prepare_data -f prompts_dataset_val.jsonl -q
!openai tools fine_tunes.prepare_data -f prompts_dataset_test.jsonl -q


In [None]:
project("parambharat", "mave").artifact("prepared_dataset") 