# Install and import required libraries

In [1]:
!pip install accelerate transformers einops datasets peft bitsandbytes --upgrade



In [2]:
# Log in to the Hugging Face Hub with the token stored under the key 'hf_token'
# in Colab's `userdata`. add_to_git_credential=True also stores the token in the
# configured git credential helper (see the login output below).
import huggingface_hub
from google.colab import userdata
huggingface_hub.login(token=userdata.get('hf_token'), add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig, GenerationConfig
from peft import LoraConfig, get_peft_model
import os

# Get tokenizer from microsoft/phi-1_5

In [4]:
# Hub repository ID of the base model that is loaded and fine-tuned below.
model_repo = "microsoft/phi-1_5"
# Hub repository ID of the dataset used for training and evaluation.
data_repo = "WKLI22/Raiders_of_the_Lost_Kek_Chat"

In [5]:
# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)#, padding_side='left')

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Get LLM model from microsoft/phi-1_5

In [6]:
# 4-bit quantization settings handed to the model loader below:
# NF4 quantization type, double quantization enabled, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load the base causal language model with the quantization config applied.
# The device_map sends the whole model to device index 0
# (the next cell reports the parameters as being on 'cuda').
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    device_map={"":0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

In [7]:
# Print the module tree of the loaded model.
print(model)
# Take the device of the first parameter as the model's device and display its type.
device = next(model.parameters()).device
device.type

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear4bit(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernor

'cuda'

# Prepare PEFT model for our LLM

In [8]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

# Hub repository holding a previously trained LoRA adapter for this base model.
adapter_repo = "WKLI22/phi-1_5-finetuned-4chan"

# True  -> continue fine-tuning from the published adapter.
# False -> start from a freshly initialised LoRA adapter.
load_peft=True
if load_peft:
    # Adapter configuration, fetched for inspection; it is not used further below.
    config = PeftConfig.from_pretrained(adapter_repo)
    # is_trainable=True loads the adapter in training mode, so the LoRA weights
    # are trainable without toggling requires_grad on each parameter by hand.
    model = PeftModel.from_pretrained(model, adapter_repo, is_trainable=True)
else:
    # Fresh LoRA adapter on the attention projections (q/k/v/dense) and the
    # MLP layers (fc1/fc2) listed in the model printout above.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, lora_config)

# Report how many parameters will actually be updated during training.
model.print_trainable_parameters()

adapter_config.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/56.7M [00:00<?, ?B/s]

trainable params: 14,155,776 || all params: 1,432,426,496 || trainable%: 0.9882375144225202


# Get Dataset from WKLI22/Raiders_of_the_Lost_Kek_Chat

In [9]:
# Stream the dataset from the Hub (streaming=True) rather than downloading it
# in full; the repository ID comes from the `data_repo` constant defined above.
data = load_dataset(data_repo, streaming=True)

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [10]:
# Collect the "content" text of the first four training samples; they are used
# below for a quick look at how the tokenizer chunks long texts.
_tmp = [sample["content"] for sample in data["train"].take(4)]

In [11]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk; also the default chunk size of `tokenize` below.
context_length = 128

# The tokenizer was already loaded from `model_repo` and given its pad token
# earlier in the notebook, so it is reused here instead of being fetched again.
# Long texts are split into several chunks of at most `context_length` tokens
# (return_overflowing_tokens=True) and the length of each chunk is reported.
outputs = tokenizer(
    _tmp,
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 36
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 107, 128, 128, 128, 128, 128, 128, 128, 128, 126, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 78, 128, 128, 128, 128, 128, 128, 23]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]


In [12]:
def detokenize(token_ids):
    """Decode a sequence of token IDs back into text, leaving out special tokens."""
    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [13]:
def tokenize(element, context_length=context_length):
    """Tokenize a batch of texts into fixed-size chunks.

    The texts in ``element["content"]`` are tokenized with truncation and
    overflow enabled, so a long text yields several chunks of at most
    ``context_length`` tokens. Only chunks whose length equals
    ``context_length`` are kept; shorter chunks are discarded.

    Returns:
        dict with a single key ``"input_ids"`` holding the list of kept chunks.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}


In [14]:
# Shuffle the streamed training split with a fixed seed and keep 10,000 samples.
data_train_shuffle = data["train"].shuffle(seed=42).take(10_000)
data_train_shuffle

IterableDataset({
    features: ['content'],
    n_shards: 42
})

In [15]:
# Shuffle the streamed test split with the same seed and keep 100 samples for evaluation.
data_test_shuffle = data["test"].shuffle(seed=42).take(100)
data_test_shuffle

IterableDataset({
    features: ['content'],
    n_shards: 1
})

In [16]:

# Apply `tokenize` to the training samples in batches and drop the original
# columns, so only the "input_ids" produced by `tokenize` remain.
tokenized_data_train = data_train_shuffle.map(
    tokenize, batched=True, remove_columns=data["train"].column_names
)
tokenized_data_train

IterableDataset({
    features: Unknown,
    n_shards: 42
})

In [17]:
# Same batched tokenization for the evaluation samples; the original columns are dropped.
tokenized_data_test = data_test_shuffle.map(
    tokenize, batched=True,remove_columns=data["test"].column_names
)
tokenized_data_test

IterableDataset({
    features: Unknown,
    n_shards: 1
})

# Training

In [18]:
# Training configuration.
#
# An earlier TrainingArguments instance that was assigned to the same name and
# immediately overwritten has been removed; these are the values in effect.
#
# The training set is a streamed IterableDataset, so the length of the run is
# given by max_steps. Per the TrainingArguments documentation a positive
# max_steps overrides num_train_epochs, which is kept only for reference.
# Each optimizer step covers 8 samples x 4 accumulation steps per device.
# Evaluation and checkpointing both happen every 50 steps, and at most 5
# checkpoints are kept on disk.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_arguments = TrainingArguments(
    output_dir="phi-1_5-finetuned-4chan",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-3,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    lr_scheduler_type="cosine",
    logging_steps=1,
    max_steps=500,
    num_train_epochs=1,
    save_total_limit=5,
    fp16=True,
    push_to_hub=True,
)


In [None]:
# Build the Trainer from the PEFT model, the tokenized train/eval streams and
# the training arguments. The collator is created with mlm=False, i.e. without
# masked-language-model masking.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data_train,
    eval_dataset=tokenized_data_test,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# Run fine-tuning.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
50,3.4182,3.466731
100,3.3506,3.443477
150,3.3223,3.421112
200,3.5472,3.402459
250,3.2816,3.381698
300,3.1114,3.361852
350,3.3596,3.341203
400,3.3336,3.329765


In [None]:
# Upload through the Hugging Face CLI.
# NOTE(review): only one positional argument is given; confirm the CLI resolves
# it to the local output directory of the same name.
!huggingface-cli upload "phi-1_5-finetuned-4chan"


# Save model

In [None]:
# Save the current `model` to the local directory "phi-1_5-finetuned-4chan".
model.save_pretrained("phi-1_5-finetuned-4chan")

# Merge model

In [None]:
# Reload the base model in float32, without the 4-bit quantization config used
# for training, so the adapter can be merged into plain weights.
model = AutoModelForCausalLM.from_pretrained(model_repo, trust_remote_code=True, torch_dtype=torch.float32)

# Attach the LoRA adapter saved locally above. The former `from_transformers=True`
# argument was dropped: it is not a parameter of PeftModel.from_pretrained.
peft_model = PeftModel.from_pretrained(model, "phi-1_5-finetuned-4chan")

# Fold the adapter weights into the base weights and drop the PEFT wrapper.
model = peft_model.merge_and_unload()

In [None]:
# Save the merged model locally.
# NOTE(review): this is the same directory the adapter was saved to in the
# "Save model" cell - confirm that keeping both in one folder is intended.
model.save_pretrained("phi-1_5-finetuned-4chan")

# model.push_to_hub("llm-exp/phi-1_5-finetuned-4chan")

# Load model @local

In [None]:
from transformers import AutoModel, AutoModelForCausalLM

# Local directory the model was saved to above.
model_path = "phi-1_5-finetuned-4chan"

# Load with the causal-LM head. The inference cells below call model.generate(),
# which needs the language-modelling head; plain AutoModel loads only the
# backbone without it.
model = AutoModelForCausalLM.from_pretrained(model_path)

# `model` can now be used for generation in the inference cells below.


# Load model @Huggingface

In [None]:
#from peft import PeftModel, PeftConfig
#from transformers import AutoModelForCausalLM

# Fetch the adapter configuration from the Hub (not used further in this cell).
config = PeftConfig.from_pretrained("WKLI22/phi-1_5-finetuned-4chan")
#model = AutoModelForCausalLM.from_pretrained("Open-Orca/oo-phi-1_5")
# Wrap whatever `model` currently refers to with the adapter published on the Hub.
# NOTE(review): if `model` is already the merged model from the "Merge model"
# section, this would apply the adapter a second time - confirm which model is intended.
model = PeftModel.from_pretrained(model, "WKLI22/phi-1_5-finetuned-4chan")

# Inference

In [None]:
# Row index used by the lookups at the end of this cell.
# NOTE(review): this rebinds the builtin name `id` for the rest of the session.
id = 67
def show(text):
  """Return `text` with every '. ' turned into '.' plus a newline, so that each sentence starts on its own line."""
  return text.replace('. ', '.\n')
# Print one prompt/completion pair, one sentence per line.
# NOTE(review): `data_df` is not defined in any cell visible in this notebook -
# confirm where it is supposed to come from before running this cell.
print(show(data_df.iloc[id]['Prompt']),'\n')
print(show(data_df.iloc[id]['Completion']))

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

#model = AutoModelForCausalLM.from_pretrained("phi-1_5-finetuned-med-text", trust_remote_code=True, torch_dtype=torch.float32)


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
#tokenizer = AutoTokenizer.from_pretrained("Open-Orca/oo-phi-1_5", trust_remote_code=True)

#prompt = data_df.iloc[id]['Prompt']
#prompt = "A 30 year old male that has frequent diarrhea after consuming food. "\
#         "What could be the diagnosis and plan?"

prompt = "A 30 year old male that has frequent diarrhea after consuming food. "\
         "What may be the cause?"

# Sampling is enabled, but with a very low temperature (0.01).
generation_config_orig = GenerationConfig(
    max_length=1024, temperature=0.01, top_p=0.95, repetition_penalty=1.1,
    do_sample=True, use_cache=True,
    eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id,
    transformers_version="4.33.1"
    )

# NOTE(review): `preprocess` is not defined in any cell visible in this
# notebook - confirm where it is supposed to come from.
input_text = preprocess(prompt=prompt)
# Send the inputs to the same device the model was moved to. They were
# previously sent to a hard-coded 'cuda', which disagrees with `device`
# whenever the CPU fallback above is taken.
inputs = tokenizer(input_text, return_tensors="pt", return_attention_mask=False).to(device)
outputs = model.generate(**inputs, generation_config=generation_config_orig)

# Decode the single generated sequence and print it one sentence per line.
text = tokenizer.batch_decode(outputs)[0]
print(show(text))


In [None]:

# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token
# Define prompts
# NOTE(review): `data_df` is not defined in any cell visible in this notebook -
# confirm where it is supposed to come from before running this cell.
prompts = [
    data_df.iloc[12]['Prompt'],
    data_df.iloc[23]['Prompt'],
    data_df.iloc[145]['Prompt'],
    "A 30 year old male that has frequent diarrhea after consuming food. What could be the diagnosis and plan?",
    "A 30 year old male that has frequent diarrhea after consuming food. What may be the cause?",
    "I have frequent diarrhea after recovering from COVID, what can I do?",
    "I usually have sore eyes when I wake up in the morning, is something wrong?",
]
#prompts = [preprocess(p) for p in prompts_raw]

# Define generation configuration
# (a plain dict that is unpacked as keyword arguments into model.generate() later in this cell)
generation_config_orig = {
    "max_length": 512,
    "temperature": 0.01,
    "top_p": 0.95,
    "repetition_penalty": 1.1,
    "do_sample": True,
    "use_cache": True,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
#    "transformers_version": "4.33.1"
#}

# Preprocess prompts and tokenize
def tokenize_bulk(prompts):
    """Tokenize a list of prompts into one padded PyTorch batch.

    Args:
        prompts: list of prompt strings.

    Returns:
        The tokenizer's batch encoding (``return_tensors="pt"``), padded to a
        common length, truncated to at most 512 tokens, and including the
        attention mask.
    """
    # The attention mask is requested explicitly: the prompts are padded to a
    # common length and the pad token is the EOS token, so without the mask
    # generate() has no way to tell padding from real tokens.
    # NOTE(review): decoder-only generation usually expects left padding; the
    # tokenizer's padding side is not set in this notebook - confirm.
    return tokenizer(
        prompts,
        return_tensors="pt",
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=512,
    )

# Tokenize all prompts as one batch and move the tensors to `device`.
inputs = tokenize_bulk(prompts).to(device)

# Generate outputs
outputs = model.generate(**inputs, **generation_config_orig)

# Decode outputs, leaving special tokens out of the text
decoded_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print results, one sentence per line (via `show`), separated by blank lines
for i, text in enumerate(decoded_texts):
    print(show(text), '\n\n')
    #print(f"Prompt {i+1}: {prompts_raw[i]}\nGenerated Text: {show(text)}\n\n")


# Upload to Hugging Face

In [None]:
# Interactive login through the Hugging Face CLI.
# NOTE(review): the notebook already logs in programmatically near the top via
# huggingface_hub.login(); this step may be redundant - confirm.
!huggingface-cli login

In [None]:
!huggingface-cli upload oo-phi-1_5-finetuned-med-text

In [None]:
# Show the 'Prompt' field of the first row.
# NOTE(review): `data_df` is not defined in any cell visible in this notebook - confirm.
data_df.iloc[0]['Prompt']

In [None]:
# Show the 'Completion' field of the first row.
# NOTE(review): `data_df` is not defined in any cell visible in this notebook - confirm.
data_df.iloc[0]['Completion']

In [None]:
# Show the 'text' field of the first row.
# NOTE(review): `data_df` is not defined in any cell visible in this notebook - confirm.
data_df.iloc[0]['text']