In [2]:
# ! pip install ipywidgets

# Creating required folders

In [8]:
import os
# Needed for running Curator on the GPU.
os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

# abspath("") resolves to the current working directory; every path below hangs off it.
NOTEBOOK_DIR = os.path.abspath("")
DATA_DIR, TEMP_DIR = (
    os.path.join(NOTEBOOK_DIR, folder) for folder in ("data", ".temp")
)

# Only the data folder is created here; TEMP_DIR is merely defined as a path.
os.makedirs(DATA_DIR, exist_ok=True)

# Importing all required libraries and packages
# Importing helper functions for data processing from the Python scripts in the `helpers` folder

In [9]:
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.datasets import DocumentDataset
from nemo_curator.filters import WordCountFilter
from nemo_curator.modifiers import UnicodeReformatter
from nemo_curator.utils.file_utils import expand_outdir_and_mkdir
from nemo_curator import ScoreFilter, Sequential
from nemo_curator.modules.modify import Modify
from functools import partial
from nemo_curator.filters import RepeatingTopNGramsFilter


# # Importing helper functions
# from helpers.filters import FilterLowScores
# from helpers.docbuilder import download_and_convert_dataset
from helpers.filters import FilterLowScores
from helpers.docbuilder import download_and_convert_dataset
from helpers.modifiers import SectionNumberFormatter, redact_pii


# Setting System Requirements

In [10]:
# Target device and worker count handed to get_client.
device = "gpu"
n_workers = 16
# Create the distributed client; it is kept in `client` so the final cell can close it.
client = get_client(device, n_workers=n_workers, set_torch_to_use_rmm=False)

cuDF Spilling is enabled


# Downloading the data from Hugging Face and processing it
* Each question and title pair has multiple answers. The answer with the highest score is picked and the rest of the answers are discarded.
* The dataset is converted into the `DocumentDataset` format provided by NeMo Curator.

In [6]:
# Fetch the Law StackExchange Q&A data under DATA_DIR and wrap it for Curator.
# NOTE(review): download_and_convert_dataset comes from helpers/docbuilder.py and is not
# visible here; it is expected to return a pandas DataFrame because its result is passed
# straight to DocumentDataset.from_pandas — confirm.
dataset_df = download_and_convert_dataset(DATA_DIR)
raw_dataset = DocumentDataset.from_pandas(dataset_df)

Download directory:  /root/ODSC-Hackathon-Repository/data/raw
Downloading Law QA dataset from 'https://huggingface.co/datasets/ymoslem/Law-StackExchange/resolve/main/law-stackexchange-questions-answers.json'...


# Running Data Curation pipeline:
### Steps in the curation process:

1. Unify text encoding to Unicode for the "title" and "question" fields.
    * We used the `UnicodeReformatter`, a `DocumentModifier` provided in NeMo Curator.



2. Apply formatting for section numbers in the "question" field. 
    * We have wrapped custom regex-based text cleaning in the `DocumentModifier` interface provided in NeMo Curator.



3. Remove personally identifiable information (PII) such as names, addresses, phone numbers, email addresses and locations from the "title" and "question" fields.
    * We have used the `PiiModifier` provided in NeMo Curator.



4. Exclude questions with fewer than 15 words.
    * We manually inspected questions with 10, 15, 20, 30, 40 and 50 words and found that questions with fewer than 15 words are not contextually complete or meaningful enough.
    * So we have discarded all the questions with fewer than 15 words. We have used `WordCountFilter` (with `min_words=15`) under `ScoreFilter` provided in NeMo Curator.



5. Remove questions with a score below zero.
    * Every question-title pair in the dataset has a score that reflects the upvotes/downvotes it received. We have discarded the questions with a score less than 0, i.e. questions that have more downvotes than upvotes.
    * For this, we have utilized our custom `FilterLowScores` filter under `ScoreFilter` provided in NeMo Curator.



6. Eliminate questions with excessive repeating n-grams.
    * We decided to remove questions dominated by repeating n-grams: this improves data quality and reduces bias in model training, because repetitive language over-emphasizes certain phrases.
    * We have used n=5 (i.e. 5-grams) with a maximum repeating ratio of 0.15.
    * We leveraged `RepeatingTopNGramsFilter` under `ScoreFilter` provided in NeMo Curator.


In [4]:
def run_curation_pipeline(dataset: DocumentDataset, device: str) -> DocumentDataset:
    """Clean and filter the raw question/answer dataset.

    Steps, in order: Unicode reformatting of "title" and "question", section-number
    formatting of "question", PII redaction of "title" and "question", then three
    filters (word count on "question", score on "question_score", repeated 5-grams
    on "question"). The "answer" field is not curated; it is dropped from the
    result together with the two score columns.

    Args:
        dataset: Dataset containing the "title", "question", "answer",
            "answer_score" and "question_score" columns.
        device: Device label; it is only echoed in the progress message.

    Returns:
        The persisted, curated dataset without the answer and score columns.
    """
    print(f"Running curation pipeline on '{device}'...")
    source_dataset = dataset

    # Text normalisation of the fields that end up in the prompt.
    text_cleanup = [
        Modify(UnicodeReformatter(), text_field="title"),
        Modify(UnicodeReformatter(), text_field="question"),
        Modify(SectionNumberFormatter(), text_field="question"),
    ]

    # PII redaction, one pass per field.
    pii_redaction = [
        partial(redact_pii, text_field="title"),
        partial(redact_pii, text_field="question"),
    ]

    # Row-level filters applied after the text has been cleaned.
    quality_filters = [
        ScoreFilter(
            WordCountFilter(min_words=15),
            text_field="question",
            score_type=int,
        ),
        ScoreFilter(
            FilterLowScores(score_threshold=0),
            text_field="question_score",
            score_type=bool,
        ),
        ScoreFilter(
            RepeatingTopNGramsFilter(n=5, max_repeating_ngram_ratio=0.15),
            text_field="question",
        ),
    ]

    pipeline = Sequential(text_cleanup + pii_redaction + quality_filters)

    # Run every step and keep the result materialised.
    curated = pipeline(source_dataset).persist()

    # These columns are not needed downstream.
    curated.df = curated.df.drop(columns=["answer", "answer_score", "question_score"])

    orig_len = len(source_dataset.df)
    new_len = len(curated.df)

    print(f"Original dataset length: {orig_len}")
    print(f"New dataset length: {new_len}")

    return curated

In [8]:
# Curate the raw dataset; the function prints the row counts before and after curation.
curated_dataset = run_curation_pipeline(raw_dataset, device)

Running curation pipeline on 'gpu'...


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('question', 'float64'))

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
2024-10-28 00:40:00 INFO:Loaded recognizer: EmailRecognizer
2024-10-28 

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m262.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
2024-10-28 00:40:13 INFO:Finished downloading model en_core_web_lg
2024-10-28 00:40:15 INFO:Loaded recognizer: AddressRecognizer


Original dataset length: 24343
New dataset length: 21069


# Creating System Prompt
* Each question-title input is wrapped in a system prompt that tells the LLM what the task is, given the question, the title and the list of tags.
* The training data also includes the tags associated with each question-title pair.

In [11]:
# Prompt template rendered once per example; {title} and {question} are the only
# placeholders and are filled in with str.format.
# NOTE(review): the text says a list of tags "enclosed in ^^^^^" is provided, but the
# template contains no such section or placeholder — confirm whether the tag list is
# meant to be part of the prompt.
# NOTE(review): the prompt asks for JSON, yet the format shown ("tags : ...") is not
# valid JSON — confirm the intended output format.
SYSTEM_PROMPT = '''
You are an expert in Law and also in tagging legal questions.
You are provided with a question enclosed in +++++ and it's corresponding title enclosed in >>>>> from the law domain.

You are also provided with a list of all the tags, enclosed in ^^^^^.

Your task is to:
i. Understand the question and it's title.
ii. Pick up the tags that are most appropriate and relevant to the question, strictly from the tags provided to you.
iii. Make sure you return the tags alone without their description.


```
NOTES: All tags must be in lowercase, ordered lexicographically and separated by commas.
```

Your output should be a JSON with the below format:
```
tags : <Put your relevant tags here>
```


>>>>>
Title: {title}
>>>>>


+++++
Question: {question}
+++++

'''

# Collecting the unique list of tags from the entire train data.
* Since each question can have multiple tags associated with it, we collected the list of all unique tags so that it can be passed into the prompt for the LLM to choose from.

In [12]:
import pandas as pd
# Load the tag table and keep its "Tag" column as a plain Python list.
# NOTE(review): nothing here removes duplicates — presumably the CSV already holds one
# row per tag; confirm.
tags_df = pd.read_csv("Tags and QA.csv")
tag_desc = list(tags_df['Tag'])

# Formatting the data with the system prompt to be passed during training process

In [13]:
import random
import ast
from tqdm import tqdm

def format_dataset(dataset: DocumentDataset, filename: str, tag_desc, SYSTEM_PROMPT) -> DocumentDataset:
    """Turn question/title rows into prompt ("input") and label ("output") columns.

    Args:
        dataset: Dataset with "title" and "question" columns and, optionally, a
            "tags" column.
        filename: Value written to the "filename" column of every row.
        tag_desc: Currently unused; kept so existing callers keep working.
        SYSTEM_PROMPT: Template containing ``{title}`` and ``{question}`` placeholders.

    Returns:
        A new DocumentDataset in which "title", "question" and "tags" are replaced by
        "input" (the rendered prompt) and "output" (the row's tags, or "" when the
        dataset has no "tags" column).

    Raises:
        KeyError: If "title" or "question" is missing, or if the template references
            a placeholder other than ``title``/``question``.
    """
    # Materialise the frame; it is assumed to be a pandas DataFrame because the
    # result is handed back through DocumentDataset.from_pandas.
    df = dataset.df.compute().reset_index(drop=True)

    missing = [col for col in ("title", "question") if col not in df.columns]
    if missing:
        raise KeyError(f"format_dataset: missing required column(s): {missing}")

    has_tags = "tags" in df.columns

    # Formatting errors are allowed to propagate: silently skipping a row would leave
    # fewer prompts than rows and fail later with an unrelated length-mismatch error.
    rows = zip(df["question"], df["title"])
    inputs = [
        SYSTEM_PROMPT.format(question=question, title=title)
        for question, title in tqdm(rows, total=len(df))
    ]
    outputs = list(df["tags"]) if has_tags else [""] * len(df)

    df["input"] = inputs
    df["output"] = outputs
    df["filename"] = filename

    # The raw text columns are fully represented by "input"/"output" now.
    consumed = ["title", "question"] + (["tags"] if has_tags else [])
    df = df.drop(columns=consumed)

    return DocumentDataset.from_pandas(df)


In [30]:
# Render the prompt/label pairs for the curated data, then show the column sets
# before and after formatting.
formatted_dataset = format_dataset(curated_dataset, "law-stackexchange-curated.jsonl", tag_desc, SYSTEM_PROMPT)
print(f"Original dataset columns: {curated_dataset.df.columns}")
print(f"Formatted dataset columns: {formatted_dataset.df.columns}")

100%|██████████| 21069/21069 [00:02<00:00, 7107.03it/s]


Original dataset columns: Index(['filename', 'id', 'title', 'question', 'tags'], dtype='object')
Formatted dataset columns: Index(['filename', 'id', 'input', 'output'], dtype='object')


In [31]:
# Print the first rendered prompt as a visual sanity check.
print(formatted_dataset.df.head()['input'][0])

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.



You are an expert in Law and also in tagging legal questions.
You are provided with a question enclosed in +++++ and it's corresponding title enclosed in >>>>> from the law domain.

You are also provided with a list of all the tags, enclosed in ^^^^^.

Your task is to:
i. Understand the question and it's title.
ii. Pick up the tags that are most appropriate and relevant to the question, strictly from the tags provided to you.
iii. Make sure you return the tags alone without their description.


```
NOTES: All tags must be in lowercase, ordered lexicographically and separated by commas.
```

Your output should be a JSON with the below format:
```
tags : <Put your relevant tags here>
```


List of Tags to choose from:
^^^^^
^^^^^


>>>>>
Title: Why is drunk driving causing accident punished so much worse than just drunk driving?
>>>>>


+++++
Question: When people drink and drive and then cause an accident especially where if someone dies they get years and years in prison but just the 

In [32]:
print(f"Curated dataset columns: {formatted_dataset.df.columns}")
# Destination for the curated data, under DATA_DIR.
result_fp = os.path.join(DATA_DIR, "curated_dataset")
print()
print(f"Saving curated dataset to '{result_fp}'...")
# NOTE(review): write_to_filename=True presumably names each output file after the
# rows' "filename" column (later cells read <dir>/train.jsonl after setting that
# column) — confirm against the NeMo Curator docs.
formatted_dataset.to_json(result_fp, write_to_filename=True)

Curated dataset columns: Index(['filename', 'id', 'input', 'output'], dtype='object')

Saving curated dataset to '/root/ODSC-Hackathon-Repository/data/curated_dataset'...


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Writing to disk complete for 1 partitions


# Splitting the data into Train and Validation sets with 95% and 5% ratios respectively

In [33]:
from sklearn.model_selection import train_test_split

# Fraction of the formatted dataset held out for validation.
VAL_RATIO = 0.05

df = formatted_dataset.df.compute()

# Some sanity checks
assert len(df) > 0, "The dataset is empty."
assert VAL_RATIO >= 0 and VAL_RATIO <= 1, "VAL_RATIO must be between 0 and 1."
val_size = int(len(df) * VAL_RATIO)
# train_test_split rejects an empty validation set and one that consumes every row,
# so fail here with a message that points at the actual cause.
assert 0 < val_size < len(df), (
    f"VAL_RATIO={VAL_RATIO} gives a validation size of {val_size} for {len(df)} rows; "
    "it must leave at least one row in each split."
)
output_dir = f"{DATA_DIR}/split"
os.makedirs(output_dir, exist_ok=True)

# Split the data into training and validation sets (fixed seed for reproducibility)
train_df, val_df = train_test_split(df, test_size=val_size, random_state=42)

print(f"Original size: {len(df)}")
print("After splitting:")
print(f"    Train size: {len(train_df)}")
print(f"    Validation size: {len(val_df)}")

# assign() builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(filename="train.jsonl")
val_df = val_df.assign(filename="val.jsonl")

DocumentDataset.from_pandas(train_df).to_json(output_dir, write_to_filename=True)
DocumentDataset.from_pandas(val_df).to_json(output_dir, write_to_filename=True)


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Original size: 21069
After splitting:
    Train size: 20016
    Validation size: 1053


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Writing to disk complete for 1 partitions


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Writing to disk complete for 1 partitions


In [34]:
import json

# Read the training split back in (one JSON object per line) for a spot check.
file_path = f"{DATA_DIR}/split/train.jsonl"

# Decode explicitly as UTF-8 so the result does not depend on the platform's locale,
# and skip blank lines, which json.loads would reject.
with open(file_path, 'r', encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]


In [35]:
# Display one parsed record from the file loaded above.
data[4]

{'filename': 'train.jsonl',
 'id': 'law-stackexchange-qa-94691',
 'output': 'arraignment,arrest,bail,federal-courts,politics'}

# Data processing for the submission

In [14]:
# SUBMISSION DATA PREPARATION
# The formatted submission file is written next to the train/validation splits.
output_dir = f"{DATA_DIR}/split"
os.makedirs(output_dir, exist_ok=True)

# Resolve the input under DATA_DIR, like every other path in this notebook, instead of
# relying on the current working directory.
submission_path = os.path.join(
    DATA_DIR, "submission", "evaluation-dataset-verified-for-participants.jsonl"
)
assert os.path.exists(submission_path), f"The submission dataset does not exist at '{submission_path}'"

# Separate names for the path, the raw dataset and the formatted dataset, so that
# `submission_ds` only ever refers to the formatted DocumentDataset.
submission_raw = DocumentDataset.read_json(submission_path)
submission_ds = format_dataset(submission_raw, "submission.jsonl", tag_desc, SYSTEM_PROMPT)
print("Writing the formatted submission dataset to disk...")
submission_ds.to_json(output_dir, write_to_filename=True)

Reading 1 files


100%|██████████| 5000/5000 [00:00<00:00, 61458.25it/s]


Writing the formatted submission dataset to disk...
Writing to disk complete for 1 partitions


In [15]:
import json

# Read the formatted submission file back in (one JSON object per line) for a spot check.
file_path = f"{DATA_DIR}/split/submission.jsonl"

# Decode explicitly as UTF-8 so the result does not depend on the platform's locale,
# and skip blank lines, which json.loads would reject.
with open(file_path, 'r', encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]


In [18]:
# Display one parsed record from the file loaded above.
data[678]

{'filename': 'submission.jsonl',
 'input': "\nYou are an expert in Law and also in tagging legal questions.\nYou are provided with a question enclosed in +++++ and it's corresponding title enclosed in >>>>> from the law domain.\n\nYou are also provided with a list of all the tags, enclosed in ^^^^^.\n\nYour task is to:\ni. Understand the question and it's title.\nii. Pick up the tags that are most appropriate and relevant to the question, strictly from the tags provided to you.\niii. Make sure you return the tags alone without their description.\n\n\n```\nNOTES: All tags must be in lowercase, ordered lexicographically and separated by commas.\n```\n\nYour output should be a JSON with the below format:\n```\ntags : <Put your relevant tags here>\n```\n\n\n>>>>>\nTitle: Navigating UK Residency Requirements for Foreign Nationals\n>>>>>\n\n\n+++++\nQuestion: What are the implications of switching from a work-sponsored visa to a family-based visa on the path to obtaining permanent residency 

In [19]:
# Close the distributed client created by get_client at the top of the notebook.
client.close()
# NOTE(review): exit(0) presumably terminates the kernel to release its resources;
# nothing placed after this cell would run in a "Run All" — confirm this is intended.
exit(0)