## Importing the Dependencies

In [None]:
!pip install datasets
!pip install jsonlines

In [None]:
from datasets import load_dataset

## Downloading the Dataset

In [None]:
# Fetch the source instruction dataset by its Hugging Face Hub identifier.
# The result is a DatasetDict; the next cell shows it has a single `train` split.
dataset = load_dataset('ttbui/alpaca_data_with_html_output')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the loaded object to inspect its splits, column names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 52530
    })
})

## Setting up Prompt for the Llama2 Model
#### This is the default prompt template used by the Llama 2 model.

In [None]:
# Template for records that have a non-empty `input` field.
# Placeholders: {instruction} goes inside the <<SYS>> block, {input} is the
# text before [/INST], and {output} is the expected completion.
# NOTE(review): the reference Llama 2 chat format separates the <<SYS>> block
# with newlines — confirm that single spaces are intended here.
llama2_prompt_with_input = (
    "<s>[INST] <<SYS>> {instruction} <</SYS>> "
    "{input} [/INST] "
    "{output} </s>"
)

In [None]:
# Template for records whose `input` field is empty.
# Only {instruction} (inside the <<SYS>> block) and {output} are filled in;
# nothing is placed between the system block and [/INST].
llama2_prompt_without_input = (
    "<s>[INST] <<SYS>> {instruction} <</SYS>> "
    "[/INST] "
    "{output} </s>"
)

In [None]:
# Number of rows available in the train split.
dataset['train'].num_rows

52530

In [None]:
import itertools

# Work with a small subset: keep only the first `n` rows of the train split,
# in their original order, materialised as a plain list.
n = 2000
train_split = dataset['train']
examples = [*itertools.islice(train_split, n)]

In [None]:
# Confirm how many rows were kept in the subset.
len(examples)

2000

In [None]:
# Preview the first selected record: instruction, then input, then output,
# each printed on its own.
first_example = examples[0]
for field in ('instruction', 'input', 'output'):
  print(first_example[field])

Write an HTML template to display a greeting message. The message should include the name of the user who visits the website.
username = "John"
<html>
	<head>
		<title>Greeting Message</title>
	</head>
	<body>
		<h1>Hello, {{ username }}!</h1>
	</body>
</html>


## Formatting the Prompt according to the Dataset

In [None]:
def _to_llama2_text(record):
  """Render one record as a single Llama 2 formatted training string.

  Records whose ``input`` field is empty (falsy) are rendered with
  ``llama2_prompt_without_input``; all others use
  ``llama2_prompt_with_input``.
  """
  if record['input']:
    return llama2_prompt_with_input.format(
      instruction = record['instruction'],
      input = record['input'],
      output = record['output'],
    )
  return llama2_prompt_without_input.format(
    instruction = record['instruction'],
    output = record['output'],
  )


# One {'text': ...} row per selected example, in the same order as `examples`.
transformed_dataset = [{'text': _to_llama2_text(record)} for record in examples]

In [None]:
# Print the first formatted prompt to check the template was filled in as expected.
print(transformed_dataset[0]['text'])

<s>[INST] <<SYS>> Write an HTML template to display a greeting message. The message should include the name of the user who visits the website. <</SYS>> username = "John" [/INST] <html>
	<head>
		<title>Greeting Message</title>
	</head>
	<body>
		<h1>Hello, {{ username }}!</h1>
	</body>
</html> </s>


### Saving the Preprocessed Dataset

In [None]:
import jsonlines

# Persist the formatted rows to a JSON Lines file in the working directory.
# Each element of `transformed_dataset` is a {'text': ...} dict.
# The path is a plain string literal: it has no placeholders, so the `f`
# prefix was unnecessary.
with jsonlines.open('html_dataset.jsonl', 'w') as writer:
  writer.write_all(transformed_dataset)

In [None]:
# Quick check: the formatted rows are held in a plain Python list.
type(transformed_dataset)

list

## Loading the Preprocessed Dataset

In [None]:
# Path of the JSON Lines file written in the "Saving the Preprocessed Dataset" step.
filename = 'html_dataset.jsonl'

In [None]:
# Reload the saved file with the generic "json" loader so the rows become a
# DatasetDict again (needed for `push_to_hub` below).
html_dataset_for_llama2_finetuning = load_dataset("json", data_files = filename)

In [None]:
# Inspect the reloaded dataset: it should expose a single `text` column.
html_dataset_for_llama2_finetuning

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2000
    })
})

## Pushing the Dataset to the HuggingFace Hub

In [None]:
!pip install huggingface_hub



In [None]:
# Authenticate with the Hugging Face Hub; the CLI prompts for an access token.
# Use a token with write permission, since a later cell uploads the dataset.
# Never paste the token into a cell or leave it in saved output.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload the reloaded dataset to the Hub under the given repository id
# (requires the login performed in the previous step).
html_dataset_for_llama2_finetuning.push_to_hub('PiyushLavaniya/HTML_Dataset_for_LLama2_Finetuning')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

## Refer to the following link to see what the dataset looks like:
https://huggingface.co/datasets/PiyushLavaniya/HTML_Dataset_for_LLama2_Finetuning