## Explore the dataset for the summarization task

In [1]:
import datasets

In [None]:
# Load only the first 10% of the XSum training split.
# trust_remote_code=True is passed through to the loader as in the dataset's
# documented usage; it permits the dataset's own loading script to run.
summarization_dataset = datasets.load_dataset(
    path="EdinburghNLP/xsum",
    split="train[:10%]",
    trust_remote_code=True,
)
summarization_dataset

In [None]:
# Fetch the first record once (row-first indexing) instead of reading the
# whole 'document' and 'summary' columns just to take element 0 of each.
first_example = summarization_dataset[0]
print(f"\nDocument:\n{first_example['document']}")
print(f"\nSummary:\n{first_example['summary']}")

## torchtune built-in recipes and configs

In [None]:
# Run the torchtune CLI's `ls` subcommand to show the built-in recipes and configs.
! tune ls

## Finetune Llama 3 for summarization tasks

### Downloading the Llama 3.1 8B Instruct model

In [None]:
# Download the model files into /tmp/Meta-Llama-3.1-8B-Instruct, skipping
# original/consolidated.00.pth.
# YOUR_HF_TOKEN is a placeholder: substitute your own Hugging Face access token
# before running, and do not commit the real token with the notebook.
! tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf-token YOUR_HF_TOKEN

### Fine-tuning with torchtune by copying and modifying an existing config file using `tune cp`

In [6]:
# Uncomment and run once to copy the built-in llama3_1/8B_lora config to a local file for editing:
# ! tune cp llama3_1/8B_lora my_llama3_1_custom_config.yaml

In [None]:
# Replace the entire `dataset` section of my_llama3_1_custom_config.yaml with
# the YAML below. The string is only displayed here for reference; the edit
# itself has to be made in the config file.

"""
dataset:
  _component_: torchtune.datasets.instruct_dataset
  column_map:
    dialogue: document
    output: summary
  max_seq_len: 3072
  packed: false
  source: EdinburghNLP/xsum
  split: train[:1000]
  template: torchtune.data.SummarizeTemplate
  train_on_input: false
  trust_remote_code: true
"""

### Fine-tune Llama 3.1 8B for summarization tasks

#### a) Finetuning using the configuration file

In [None]:
%%time
# Run the lora_finetune_distributed recipe with 8 processes per node, reading
# all settings from the customized config file. %%time reports the cell's runtime.
! tune run --nproc_per_node 8 lora_finetune_distributed --config my_llama3_1_custom_config.yaml

#### b) Finetuning using the command-line overrides

In [None]:
%%time
! tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_1/8B_lora \
dataset=torchtune.datasets.instruct_dataset \
dataset.source=EdinburghNLP/xsum \
dataset.split=train[:2000] \
dataset.max_seq_len=2048 \
dataset.template=torchtune.data.SummarizeTemplate \
dataset.column_map.dialogue=document \
dataset.column_map.output=summary \
dataset.trust_remote_code=True \
dataset.packed=False \
dataset.train_on_input=False \
epochs=10


### Testing the fine-tuned model with `tune run generate`

#### a) Making a copy of the generation config file

In [None]:
# Uncomment and run once to copy the built-in generation config to a local file for editing:
# ! tune cp generation ./my_llama3_1_custom_generation_config.yaml

In [None]:
# Update the `checkpointer` section of my_llama3_1_custom_generation_config.yaml
# with the values below so generation loads the fine-tuned checkpoint files.
# NOTE(review): the `_9` suffix presumably refers to the last (0-indexed) epoch
# of the epochs=10 run — confirm against the files actually written to
# checkpoint_dir.

"""
checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3.1-8B-Instruct/
  checkpoint_files: [
    hf_model_0001_9.pt,
    hf_model_0002_9.pt,
    hf_model_0003_9.pt,
    hf_model_0004_9.pt,
  ]
"""

#### b) Run for inference

In [None]:
# Run the `generate` recipe using the customized generation config.
! tune run generate --config ./my_llama3_1_custom_generation_config.yaml

## Evaluating scalability on multiple GPUs with torchtune's distributed training

In [None]:
%%time
! tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_1/8B_lora \
dataset=torchtune.datasets.instruct_dataset \
dataset.source=EdinburghNLP/xsum \
dataset.split=train[:2000] \
dataset.max_seq_len=2048 \
dataset.template=torchtune.data.SummarizeTemplate \
dataset.column_map.dialogue=document \
dataset.column_map.output=summary \
dataset.trust_remote_code=True \
dataset.packed=False \
dataset.train_on_input=False \
epochs=1

## Appendix

In [None]:
import plotly.graph_objects as go

# Sample measurements: GPU counts for the x-axis, runtimes in seconds for the y-axis.
x_values = [2, 4, 6, 8]
y_values = [1216, 792, 662, 527]  # Replace with actual runtime values

# Layout settings gathered in one place: titles, the minimal 'plotly_white'
# template, and a fixed 600x600 canvas.
layout_options = dict(
    title='Runtime for Fine-Tuning Task',
    xaxis_title='Number of GPUs',
    yaxis_title='Runtime (seconds)',
    template='plotly_white',
    width=600,
    height=600,
)

# update_layout returns the figure itself, so construction and layout can be chained.
fig = go.Figure(data=[go.Bar(x=x_values, y=y_values)]).update_layout(**layout_options)

# Render the chart.
fig.show()
