### Llama 2 Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model = "NousResearch/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [11]:
device_map = {"": 0}

In [29]:
llama_model = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config=bnb_config,
    device_map=device_map
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
llama_model.config.use_cache = False
llama_model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fixing weird overflow issue with fp16 training

In [None]:
prompt = "rap song lyrics with downtown city vibes"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
output = llama_model.generate(**inputs, max_new_tokens=100)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response) # output for llama2 base model without fine tuning

rap song lyrics with downtown city vibes
 февруари 16, 2021 at 9:17 am
[…]Wonderful story, reckoned we could combine a couple of unrelated information, nevertheless actually worth taking a look, whoa did one learn about Mid East has got far more problerms as well […]
[…]although web sites we backlink to below are considerably not connected to ours, we really feel they’re essentially really worth


### Getting things started with dataset

In [None]:
from datasets import load_dataset
ds = load_dataset("sebastiandizon/genius-song-lyrics")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
ds.save_to_disk('/content/drive/My Drive/genius-song-lyrics-dataset')

In [11]:
from datasets import load_from_disk
# ds = load_from_disk('genius-song-lyrics-dataset'
ds = load_from_disk('/content/drive/My Drive/genius-song-lyrics-dataset')

Loading dataset from disk:   0%|          | 0/19 [00:00<?, ?it/s]

In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language'],
        num_rows: 5134856
    })
})

In [13]:
ds['train'][0]

{'title': 'Killa Cam',
 'tag': 'rap',
 'artist': "Cam'ron",
 'year': 2004,
 'views': 173166,
 'features': '{"Cam\\\\\'ron","Opera Steve"}',
 'lyrics': '[Chorus: Opera Steve & Cam\'ron]\nKilla Cam, Killa Cam, Cam\nKilla Cam, Killa Cam\nKilla Cam, Cam\nKilla Cam, Killa Cam, Cam\nKilla Killa Killa Cam\nKilla Cam, Cam, Killa (Killa!)\nKilla Cam, Killa Cam, Cam (Bases loaded)\nKilla Cam, Killa Cam (Uh-huh)\nKilla Cam, Cam (Santana on second, Jim on third)\nKilla Cam, Killa Cam, Cam (I\'m at bat)\nKilla Killa Killa Cam\nKilla Cam, Cam, Killa (I\'m \'bout to hit this shit out the world)\nKilla Cam (Ugh, Heatmakerz), Killa Cam, Cam\nKilla Cam, Killa Cam\nKilla Cam, Cam (Hahahaha)\nKilla Cam, Killa Cam, Cam\nKilla Killa Killa Cam\nKilla Cam, Cam, Killa (We  make this shit clap)\nKilla Cam, Killa Cam, Cam\nKilla Cam, Killa Cam\nKilla Cam, Cam\nKilla Cam, Killa Cam, Cam\nKilla Killa Killa Cam (Killa! Killa!)\nKilla Cam, Cam, Killa\n[Verse 1]\nWith the goons I spar, stay in tune with ma (What up?)

In [14]:
ds['train'].unique('tag')

['rap', 'rb', 'rock', 'pop', 'misc', 'country']

In [15]:
ds['train'].unique('language')

['en',
 None,
 'fr',
 'de',
 'pt',
 'es',
 'zh',
 'ru',
 'it',
 'ja',
 'ro',
 'nl',
 'pl',
 'fi',
 'ko',
 'ar',
 'sv',
 'tr',
 'da',
 'cs',
 'no',
 'is',
 'fil',
 'bg',
 'hr',
 'fa',
 'vi',
 'he',
 'ga',
 'sk',
 'hu',
 'la',
 'id',
 'hi',
 'mk',
 'sl',
 'sr',
 'ne',
 'lt',
 'el',
 'lv',
 'sq',
 'et',
 'af',
 'ca',
 'ku',
 'kk',
 'si',
 'bn',
 'ka',
 'az',
 'ms',
 'eo',
 'th',
 'ta',
 'cy',
 'mn',
 'eu',
 'sw',
 'gl',
 'pa',
 'gd',
 'yi',
 'fy',
 'bs',
 'be',
 'uk',
 'hy',
 'mt',
 'ceb',
 'lb',
 'my',
 'kn',
 'ur',
 'te',
 'am',
 'ml',
 'km',
 'mr',
 'ky',
 'ps',
 'gu',
 'mg',
 'tg',
 'uz']

In [16]:
ds_english = ds['train'].filter(lambda x: x['language'] == 'en')
ds_english

Dataset({
    features: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language'],
    num_rows: 3374198
})

In [17]:
columns_to_remove = ['title', 'artist', 'year', 'views', 'features', 'id', 'language_cld3', 'language_ft', 'language']
ds_selected = ds_english.remove_columns(columns_to_remove)
ds_selected

Dataset({
    features: ['tag', 'lyrics'],
    num_rows: 3374198
})

In [18]:
from collections import Counter
tag_counts = Counter(ds_selected['tag'])
tag_counts

Counter({'rap': 964605,
         'rb': 155082,
         'rock': 633308,
         'pop': 1393559,
         'misc': 140986,
         'country': 86658})

**Formatting the dataset**

LLaMA expects inputs in the following format:

```
[INST] <GENRE>: <PROMPT> [/INST]
<SYSTEM MESSAGE>
<GENERATED_LYRICS>
```

In [48]:
ds_test = ds_selected.select(range(1000))
ds_test

Dataset({
    features: ['tag', 'lyrics'],
    num_rows: 1000
})

In [None]:
def format_lyrics(example):
    """
    Formats the dataset so that the model learns to generate song lyrics based on the given genre.
    """
    genre = example["tag"]
    lyrics = example["lyrics"]

    formatted_prompt = f"[INST] Write a {genre} song.[/INST]\n"

    input_ids = tokenizer(formatted_prompt, truncation=True, padding="max_length", max_length=512)["input_ids"]
    labels = tokenizer(lyrics, truncation=True, padding="max_length", max_length=512)["input_ids"]

    return {"input_ids": input_ids, "labels": labels}

tokenized_dataset = ds_test.map(format_lyrics, remove_columns=["tag", "lyrics"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [50]:
# save_path = "./tokenized_lyrics_dataset"
save_path = "/content/drive/My Drive/tokenized_lyrics_dataset"
tokenized_dataset.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [51]:
# tokenized_dataset = load_from_disk("tokenized_lyrics_dataset")
tokenized_dataset = load_from_disk("/content/drive/My Drive/tokenized_lyrics_dataset")
tokenized_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 1000
})

In [None]:
split_ratio = 0.8  # 80% training, 20% validation

splits = tokenized_dataset.train_test_split(test_size=(1 - split_ratio), shuffle=True, seed=42)

train_dataset = splits["train"]
validation_dataset = splits["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")

Train dataset size: 800
Validation dataset size: 200


In [53]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

In [None]:
# LoRA Configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # Fine-tuning only Query & Value projections
    task_type="CAUSAL_LM"
)

In [55]:
llama_model = get_peft_model(llama_model, lora_config)
# llama_model.to("cuda" if torch.cuda.is_available() else "cpu")
# llama_model.print_trainable_parameters()

In [None]:
training_args = TrainingArguments(
    output_dir="./llama2-finetuned-lyrics",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1, # 3
    save_total_limit=2,
    weight_decay=0.01,
    fp16=True if torch.cuda.is_available() else False,
    bf16=False,
    push_to_hub=False,
    report_to="none",
    remove_unused_columns=False
)

In [60]:
trainer = Trainer(
    model=llama_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

In [61]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,6.5972,6.686642


TrainOutput(global_step=100, training_loss=7.001467590332031, metrics={'train_runtime': 881.5872, 'train_samples_per_second': 0.907, 'train_steps_per_second': 0.113, 'total_flos': 1.6248515592192e+16, 'train_loss': 7.001467590332031, 'epoch': 1.0})

In [89]:
trainer.model.merge_and_unload()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): LlamaForCausalLM(
          (model): LlamaModel(
            (embed_tokens): Embedding(32000, 4096, padding_idx=0)
            (layers): ModuleList(
              (0-31): 32 x LlamaDecoderLayer(
                (self_attn): LlamaAttention(
                  (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
                  (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
                  (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
                  (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
                )
                (mlp): LlamaMLP(
                  (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
                  (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
                  (down_proj): Linear4bit(i

In [90]:
trainer

<transformers.trainer.Trainer at 0x7fd627e0a7d0>

In [1]:
model_path = "/content/drive/My Drive/llama2-song-lyrics"

In [None]:
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# model_path = "/content/drive/My Drive/llama2-song-lyrics"
model_path = "llama2-song-lyrics"
tokenizer_new = AutoTokenizer.from_pretrained(model_path)

In [4]:
base_model  = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map=device_map
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
from peft import PeftModel

In [6]:
new_model = PeftModel.from_pretrained(base_model, model_path)



In [7]:
new_model = new_model.merge_and_unload()



In [None]:
prompt = "[INST]rap song lyrics with downtown city vibes [INST]"
inputs = tokenizer_new(prompt, return_tensors="pt").to("cuda")
output = new_model.generate(**inputs, max_new_tokens=100)
response = tokenizer_new.decode(output[0], skip_special_tokens=True)

In [28]:
cleaned_response = response.replace(prompt, "").strip()
print(cleaned_response)

rap song lyrics with downtown city vibes
 everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to rule the world everybody wants to


In [29]:
from transformers import pipeline

prompt = "rap song lyrics with downtown city vibes"
pipe = pipeline(task="text-generation", model=new_model, tokenizer=tokenizer_new, max_length=100)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<s>[INST] rap song lyrics with downtown city vibes [/INST]
 Einzeln, 2017

# rap song lyrics with downtown city vibes

### by [@inst](https://github.com/inst)

---

###### [![Twitter Follow](https://img.shields.io/twitter/follow/inst?style=social)](https://twitter.com/


In [31]:
prompt = "write a romantic song lyrics"
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] write a romantic song lyrics [/INST]
 everybody wants to be famous,
[INST] everybody wants to be on TV,
[INST] everybody wants to be on top,
[INST] everybody wants to be a star.
[INST] everybody wants to go to heaven,
[INST] everybody wants to go to hell,
[INST] everybody wants to be in your arms,
[INST] everybody wants to be somebody.
