In [None]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 guardrail-ml==0.0.12 tensorboard
!apt-get -qq install poppler-utils tesseract-ocr
!pip install -q unstructured["local-inference"]==0.7.4 pillow

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from guardrail.client import (
    run_metrics,
    run_simple_metrics,
    create_dataset)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Used for multi-gpu
local_rank = -1
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
weight_decay = 0.001
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
max_seq_length = None

# The model that you want to train from the Hugging Face hub
model_name = "stabilityai/stablelm-3b-4e1t"

# Fine-tuned model name
new_model = "stabilityai/stablelm-3b-4e1t-Finetuning"

# The instruction dataset to use
dataset_name = "atasoglu/databricks-dolly-15k-tr"

# Activate 4-bit precision base model loading
use_4bit = True

# Activate nested quantization for 4-bit base models
use_nested_quant = False

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Number of training epochs
num_train_epochs = 2

# Enable fp16 training, (bf16 to True with an A100)
fp16 = False

# Enable bf16 training
bf16 = False

# Use packing dataset creating
packing = False

# Enable gradient checkpointing
gradient_checkpointing = True

# Optimizer to use, original is paged_adamw_32bit
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine, and has advantage for analysis)
lr_scheduler_type = "cosine"

# Number of optimizer update steps, 10K original, 20 for demo purposes
max_steps = -1

# Fraction of steps to do a warmup for
warmup_ratio = 0.03

# Group sequences into batches with same length (saves memory and speeds up training considerably)
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 10

# Log every X updates steps
logging_steps = 1

# The output directory where the model predictions and checkpoints will be written
output_dir = "./results"

# Load the entire model on the GPU 0
device_map = {"": 0}

# Visualize training
report_to = "tensorboard"

# Tensorboard logs
tb_log_dir = "./results/logs"

In [None]:
def load_model(model_name):
    # Load tokenizer and model with QLoRA configuration
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    if compute_dtype == torch.float16 and use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=bnb_config
    )

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    # Load LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"] ,
    )

    # Load Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer, peft_config

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model, tokenizer, peft_config = load_model(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

Loading stabilityai/stablelm-3b-4e1t requires to execute some code in that repo, you can inspect the content of the repository at https://hf.co/stabilityai/stablelm-3b-4e1t. You can dismiss this prompt by passing `trust_remote_code=True`.
Do you accept? [y/N] y


Downloading (…)on_stablelm_epoch.py:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/stabilityai/stablelm-3b-4e1t:
- configuration_stablelm_epoch.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Loading stabilityai/stablelm-3b-4e1t requires to execute some code in that repo, you can inspect the content of the repository at https://hf.co/stabilityai/stablelm-3b-4e1t. You can dismiss this prompt by passing `trust_remote_code=True`.
Do you accept? [y/N] y


Downloading (…)ng_stablelm_epoch.py:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/stabilityai/stablelm-3b-4e1t:
- modeling_stablelm_epoch.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading model.safetensors:   0%|          | 0.00/5.59G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Prompt1

In [None]:
def format(sample):
    instruction = f"<s>[INST] {sample['instruction']}"
    context = f"Here's some context: {sample['context']}" if (sample["context"])=="null" else None
    response = f" [/INST] {sample['response']}"
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format(sample)}{tokenizer.eos_token}"
    return sample



Prompt2

In [None]:
def format(sample):
    instruction = f"Human: {sample['instruction']}"
    context = f"Assistant: {sample['response']}" if (sample["context"])=="null" else None
    # response = f" [/INST] {sample['response']}"
    # join all the parts together
    prompt = "".join([i for i in [instruction, context] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format(sample)}{tokenizer.eos_token}"
    return sample


local Dataset


In [None]:
from datasets import load_dataset, Dataset

dataset = load_dataset("json", data_files="/content/deneme.json", split="train")
dataset

HuggingFace


In [None]:
# apply prompt template per sample
dataset = load_dataset("atasoglu/databricks-dolly-15k-tr", split="train")
dataset

Downloading readme:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15014
})

In [None]:

# Shuffle the dataset
dataset_shuffled = dataset.shuffle(seed=42)

In [None]:
dataset_shuffled

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15014
})

In [None]:
# Select the first 50 rows from the shuffled dataset, comment if you want 15k
# dataset = dataset_shuffled.select(range(15000))

dataset = dataset_shuffled.map(template_dataset, remove_columns=list(dataset.features))
dataset

Map:   0%|          | 0/15014 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 15014
})

In [None]:
dataset[0]

{'text': '<s>[INST] Uzaylılar dünyayı en son ne zaman ziyaret etti? [/INST] Uzaylıların dünyayı ziyaret ettiğine dair hiçbir kanıt yok. UFO gözlemleri, garip atmosferik modeller, o dönemde insan yeteneklerinin dışında özellikler gösteren eski binalar gibi birçok iddia var. Ancak bunların hepsi çürütüldü ve uzaylı ziyaretine dair inandırıcı bir kanıt yok.</s>'}

In [None]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

In [None]:
generator("[INST] Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire? [/INST]", max_length=1024)



[{'generated_text': '[INST] Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire? [/INST]\n sierp 2019\n[INST] Who was the first person to be killed by a White Walker in the series A Song of Ice and Fire? [/INST]\n[INST] Who was the first person to be killed by a White Walker in the series A Song of Ice and Fire?\n[INST] Who was the first person to be killed by a White Walker in the series A Song of Ice and Fire? [/INST]\n[INST] Who was the first person to be killed by a White Walker in the series A Song of Ice and Fire? [/INST] [INST] Who was the first person to be killed by a White Walker in the series A Song of Ice and Fire? [/INST]\n[INST] Who was the first person to be killed by a White Walker in the series A Song of Ice and Fire? [/INST] [INST] Who was the first person to be killed by a White Walker in the series A Song of Ice and Fire? [/INST] [INST] Who was the first person to be killed by a White Walker in 

In [None]:
'''
### You can try with your own datasets as well
dataset = load_dataset("AlexanderDoria/novel17_test", split="train")
dataset_eval = load_dataset("AlexanderDoria/novel17_test", split="test")

dataset = load_dataset("json", data_files="dolly_llama_formatted_v2 (1).jsonl", split="train")
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
'''

'\n### You can try with your own datasets as well\ndataset = load_dataset("AlexanderDoria/novel17_test", split="train")\ndataset_eval = load_dataset("AlexanderDoria/novel17_test", split="test")\n\ndataset = load_dataset("json", data_files="dolly_llama_formatted_v2 (1).jsonl", split="train")\ndataset = dataset.map(template_dataset, remove_columns=list(dataset.features))\n'

In [None]:
# dataset = load_dataset("mlabonne/guanaco-llama2-1k", split="train")
# dataset_shuffled = dataset.shuffle(seed=42)

# # Select the first 50 rows from the shuffled dataset, comment if you want 15k
# dataset = dataset_shuffled.select(range(100))
# dataset

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 100
})

In [None]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=500,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)





Map:   0%|          | 0/15014 [00:00<?, ? examples/s]

In [None]:
trainer.train()
# trainer.model.save_pretrained(output_dir)

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.9309
2,2.187
3,2.2434
4,2.103
5,2.0285
6,2.1584
7,2.0893
8,2.1699
9,1.9919
10,2.1426


KeyboardInterrupt: ignored

In [None]:
%load_ext tensorboard
%tensorboard --logdir results/logs

In [None]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained("outputs")

In [None]:
dataset[0]

{'text': '<s>[INST] Uzaylılar dünyayı en son ne zaman ziyaret etti? [/INST] Uzaylıların dünyayı ziyaret ettiğine dair hiçbir kanıt yok. UFO gözlemleri, garip atmosferik modeller, o dönemde insan yeteneklerinin dışında özellikler gösteren eski binalar gibi birçok iddia var. Ancak bunların hepsi çürütüldü ve uzaylı ziyaretine dair inandırıcı bir kanıt yok.</s>'}

In [None]:
text = "Çıkmaz anında daha fazla taşım olursa kazandım mı? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Çıkmaz anında daha fazla taşım olursa kazandım mı? Çıkmaz anında daha fazla taşım olursa kazandım mı?
 Hinweis: Die Anzahl der Züge, die Sie in der ersten Runde spielen können, hängt von der Anzahl der Spieler ab.
Hinweis: Die Anzahl der Züge, die Sie in der ersten Runde spielen können, hängt von der Anzahl der Spieler ab.
Hinweis: Die An


In [None]:
text = "John Moses Browning kimdi? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

John Moses Browning kimdi? 1911
 nobody knows.
I'm not sure if this is the right place to post this, but I'm looking for a 1911 that is a little different. I'm looking for a 1911 that is a little different. I'm looking for a 1911 that is a little different. I'm looking for a 1911 that is a little different. I'm looking for a 


In [None]:
text = "Tek çocuk politikası nedir? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Tek çocuk politikası nedir? İlkokul çocuklarının politikası nedir?
 Hinweis: Die Anzahl der Antworten ist begrenzt.
Tek çocuk politikası nedir? İlkokul çocuklarının politikası nedir?
Tek çocuk politikası nedir? İlkokul çocuklarını


In [None]:
text = "Uzaylılar dünyayı en son ne zaman ziyaret etti? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Uzaylılar dünyayı en son ne zaman ziyaret etti? İşte 2019 yılında
 Hinweis: Die Anzahl der Besucher ist nicht die Anzahl der Besucher, die die Webseite besucht haben. Die Anzahl der Besucher ist die Anzahl der Besucher, die die Webseite besucht haben und die Webseite besucht haben.
Die Anzahl der Besucher ist die Anzahl der Besucher, die die Webseite besucht haben und die Web


format2 cevap

deneme


In [None]:
text = "Çıkmaz anında daha fazla taşım olursa kazandım mı? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Çıkmaz anında daha fazla taşım olursa kazandım mı? 
İşte bir çıkmaz anındaki en iyi şekilleri.

1. Çıkmaz anında daha fazla taşım olursa kazandım mı?

2. Çıkmaz anında daha fazla taşım olursa kazandım mı?

3. Çıkmaz anında daha fazla taşım ol


In [None]:
text = "John Moses Browning kimdi? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


John Moses Browning kimdi? 
Browning, John M. (1855-1926)

Browning, John M. (1855-1926)

Browning, John M. (1855-1926)

Browning, John M. (1855-1926)

Browning, John M. (1855-1926)

Browning, John M. (1855-1926)

Browning, John M. (1855


In [None]:
text = "Tek çocuk politikası nedir? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Tek çocuk politikası nedir? 
Tek çocuk politikası, çocuklarının çocukluklarında öğrenmek için öğrencilerin çocuklarının çocukluklarında öğrenmek için öğrencilerin çocukluklarında öğrenmek için öğrencilerin çocukluklarında öğrenmek iç


In [None]:
text = "Uzaylılar dünyayı en son ne zaman ziyaret etti? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Uzaylılar dünyayı en son ne zaman ziyaret etti? 
Uzaylılar dünyayı en son ne zaman ziyaret etti?

Uzaylılar dünyayı en son ne zaman ziyaret etti?

Uzaylılar dünyayı en son ne zaman ziyaret etti?

Uzaylılar dünyayı en son ne zaman ziyaret etti?

Uzaylılar dü


In [None]:
text = "Türkiye'nin başkenti neresidir? "
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Türkiye'nin başkenti neresidir? 
Kraliçe

Kral

Başkan

Başbakan

Köşkbaşkan

Başkanlık

Başbakanlık

Başkanlık

Başbakanlık

Başbakanlık

Başbakanlık

Başbakanlık

Başbakanlık

Başbakanlık
