In [1]:
!pip install -U transformers bitsandbytes einops accelerate peft datasets wandb

Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (102.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [

In [3]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, set_seed

# Seed first so everything below is repeatable
set_seed(42)

# Hub id of the checkpoint to load
modelpath = "microsoft/phi-2"

# 4-bit NF4 quantization settings with float16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized model; device placement is left to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

# Fall back to EOS as the padding token when the checkpoint does not define one,
# so batched calls (e.g. eval batches larger than 1) are able to pad their inputs.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Layers that receive LoRA adapters
lora_target_modules = ["q_proj", "k_proj", "v_proj", "dense"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
    modules_to_save=["lm_head", "embed_tokens"],
)

# Prepare the quantized model for k-bit training (gradient checkpointing on),
# then attach the adapters described by lora_config
model = get_peft_model(
    prepare_model_for_kbit_training(model, use_gradient_checkpointing=True),
    lora_config,
)

# Turn off the config's use_cache flag for training
model.config.use_cache = False

In [None]:
from datasets import load_dataset

dataset = load_dataset("flytech/python-codes-25k")
# 90/10 train/test split; seeded so the same rows land in each side on every run
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

In [7]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("flytech/python-codes-25k")

# Shuffle with a fixed seed so the sampled subset is reproducible
dataset_shuffled = dataset["train"].shuffle(seed=42)

# Keep the first 5% of the shuffled rows
subset_size = int(0.05 * len(dataset_shuffled))
subset = dataset_shuffled.select(range(subset_size))

# Split the 5% subset into training and testing (80-20 split within the 5%).
# Seeded like the shuffle so the split itself is reproducible as well.
train_test_split = subset.train_test_split(test_size=0.2, seed=42)

print(f"Training set size: {len(train_test_split['train'])}")
print(f"Testing set size: {len(train_test_split['test'])}")



Training set size: 1984
Testing set size: 497


In [8]:
# Display the summary repr of `dataset` (splits, feature names, row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'text', 'instruction', 'input'],
        num_rows: 49626
    })
})

In [9]:
# Peek at the first record of the train split to see what each field contains
dataset["train"][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...'}

In [19]:
from transformers import TrainingArguments, Trainer

# DataLoaderConfiguration is used at the bottom of this cell; import it here so the
# cell runs on a fresh kernel.
from accelerate.utils import DataLoaderConfiguration

bs = 1          # per-device train batch size
eval_bs = 4     # per-device eval batch size
ga_steps = 16   # gradient accumulation steps
epochs = 20
lr = 2e-5

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=eval_bs,
    evaluation_strategy="steps",
    logging_steps=1,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",      # val_loss will go nan with paged_adamw_8bit
    learning_rate=lr,
    group_by_length=False,
    fp16=True,
    ddp_find_unused_parameters=False,
)

# NOTE(review): no train_dataset / eval_dataset is passed here even though
# evaluation_strategy="steps" is set — confirm the datasets are attached before
# trainer.train() or trainer.evaluate() is called.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
)

# NOTE(review): dataloader_config is built but not consumed by anything in this cell.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Both names are used below; import them in this cell so it runs on a fresh kernel
# executed top to bottom.
from transformers import logging, pipeline

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our new model
# NOTE(review): the "<s>[INST] ... [/INST]" wrapper is kept unchanged — confirm it is
# the prompt format intended for this checkpoint.
prompt = "Write a paragrah about Nelson Mandela?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] Write a paragrah about Nelson Mandela? [/INST]  Nelson Mandela was a South African anti-apartheid revolutionary and politician who served as President of South Africa from 1994 to 1999. He was born on July 18, 1918, in Mvezo, South Africa, and was raised in a traditional Xhosa family. Mandela was educated at Fort Hare University and later became involved in the African National Congress (ANC), where he became a leader in the fight against apartheid. In 1962, he was arrested and sentenced to life in prison for his activism, but he was released in 1990 after international pressure. In 1994, he became the first black President of South Africa, serving until 1999. Mandela's presidency was marked by efforts to heal the country'


In [None]:
# Both names are used below; import them in this cell so it runs on a fresh kernel
# executed top to bottom.
from transformers import logging, pipeline

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# NOTE(review): the "<s>[INST] ... [/INST]" wrapper is kept unchanged — confirm it is
# the prompt format intended for this checkpoint.
prompt = "Write a python code to determine if a number is prime?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Write a python code to determine if a number is prime? [/INST]  Sure! Here's a Python code to determine if a number is prime:

def is_prime(n):
    if n < 2:
        return False
    for i in range(2, int(n ** 0.5) + 1):
        if n % i == 0:
            return False
    return True

# Example usage:
n = 23
print(is_prime(n))
```
This code uses a simple algorithm to check if a number is prime. It starts by checking if the number is less than 2, and then iterates from 2 to the square root of the number, checking if it is divisible by any of those numbers. If it's not divisible by any of those numbers, it's prime.

Note that this code


In [None]:
!pip install sacrebleu rouge-score bert-score

In [None]:
from datasets import load_metric
import sacrebleu
from rouge_score import rouge_scorer
from bert_score import BERTScorer


def evaluate(predictions, references):
    """Score generated texts against their reference texts.

    Args:
        predictions: list of generated strings.
        references: list of reference strings, aligned index-by-index with
            ``predictions``.

    Returns:
        dict with the keys ``"perplexity"`` (always ``None``; it is not
        computed here), ``"BLEU"`` (sacreBLEU corpus score), ``"ROUGE-L"``
        (mean per-pair ROUGE-L F-measure, stemming enabled) and
        ``"BERTScore"`` (mean F1).

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )
    if not predictions:
        raise ValueError("at least one prediction/reference pair is required")

    # Score ROUGE-L with rouge_score directly instead of the deprecated
    # datasets.load_metric wrapper; a plain mean over pairs is deterministic.
    # RougeScorer.score takes the reference (target) first, then the prediction.
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rouge_l_f1 = [
        scorer.score(reference, prediction)["rougeL"].fmeasure
        for prediction, reference in zip(predictions, references)
    ]

    # BERTScorer.score takes candidates first, then references, and returns
    # (precision, recall, F1); only F1 is reported.
    _, _, bert_f1 = BERTScorer(lang="en").score(predictions, references)

    return {
        "perplexity": None,
        "BLEU": sacrebleu.corpus_bleu(predictions, [references]).score,
        "ROUGE-L": sum(rouge_l_f1) / len(rouge_l_f1),
        "BERTScore": bert_f1.mean().item(),
    }


# Smoke run of evaluate(): the prediction is the reference text plus one trailing space.
sample_text = "Nelson Mandela was a South African anti-apartheid revolutionary and politician who served as President of South Africa from 1994 to 1999. He was born on July 18, 1918, in Mvezo, South Africa, and was raised in a traditional Xhosa family. Mandela was educated at Fort Hare University and later became involved in the African National Congress (ANC), where he became a leader in the fight against apartheid. In 1962, he was arrested and sentenced to life in prison for his activism, but he was release"
references = [sample_text]
predictions = [sample_text + " "]

metrics = evaluate(predictions, references)
print(metrics)


In [None]:
from transformers import pipeline
import numpy as np

# Prompt fed to every run of the sweep
prompt = "Write a paragrah about Nelson Mandela :"

# Decoding hyperparameter grid: every combination of these values is tried
temperatures = [0.5, 0.7, 1.0, 1.5]
beam_sizes = [1, 5, 10, 20]
top_k_values = [20, 40, 60, 80]

# Text-generation pipeline built from the checkpoint name
model_name = "microsoft/phi-2"
generator = pipeline(task="text-generation", model=model_name)

def generate_text(prompt, top_k, beam_size, temperature):
    """Generate text for ``prompt`` with the given decoding settings.

    Sampling is enabled whenever ``top_k`` is positive. The total length is
    capped with ``max_length=50``. Returns the ``"generated_text"`` field of
    the first result produced by ``generator``.
    """
    outputs = generator(
        prompt,
        max_length=50,
        top_k=top_k,
        num_beams=beam_size,
        temperature=temperature,
        do_sample=bool(top_k > 0),
    )
    first_result = outputs[0]
    return first_result["generated_text"]

def evaluate_text(text):
    """Placeholder scorer: ignores ``text`` and draws a random float from NumPy's global RNG."""
    simulated_score = np.random.random_sample()
    return simulated_score

# Experiment and collect results
from itertools import product

results = []
# product() walks the grid in the same order as nested loops: top_k outermost,
# temperature innermost.
for top_k, beam_size, temperature in product(top_k_values, beam_sizes, temperatures):
    generated_text = generate_text(prompt, top_k, beam_size, temperature)
    score = evaluate_text(generated_text)

    # The pipeline output starts with the prompt itself; drop that prefix so the
    # stored 50-character preview shows what the model actually generated.
    if generated_text.startswith(prompt):
        completion = generated_text[len(prompt):]
    else:
        completion = generated_text
    results.append(((top_k, beam_size, temperature), score, completion.lstrip()[:50]))

# Display results, highest score first
for params, score, sample_text in sorted(results, key=lambda x: x[1], reverse=True):
    print(f"Params (top_k, beam_size, temperature): {params} - Score: {score:.2f} - Sample Text: {sample_text}")
