In [1]:
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from pathlib import Path
import pandas as pd

In [4]:
# Directory containing text files
data_input_txt = Path('data/txts')
data_output_splits = Path('data/splits')
data_output_splits.mkdir(parents=True, exist_ok=True)  # Ensure the output directory exists

# List to store train and evaluation data for each file
data_records = []

# Reading and splitting all text files from the directory
for file_name in tqdm(os.listdir(data_input_txt)):
    if file_name.endswith('.txt'):
        with open(data_input_txt / file_name, 'r', encoding='utf-8') as file:
            text = file.read()
            if len(text) < 1000:
                continue  # skip files with less than 1000 characters
            text_lines = text.split('\n')
            train_lines, eval_lines = train_test_split(text_lines, test_size=0.2, random_state=42)
            for line in train_lines:
                data_records.append({
                    'file_name': file_name,
                    'split': 'train',
                    'text': line
                })
            for line in eval_lines:
                data_records.append({
                    'file_name': file_name,
                    'split': 'eval',
                    'text': line
                })

df = pd.DataFrame(data_records)
df

  0%|          | 0/165 [00:00<?, ?it/s]

100%|██████████| 165/165 [00:02<00:00, 57.53it/s]


Unnamed: 0,file_name,split,text
0,1-szum-20131104-22-19.txt,train,– Żyjemy w czasach określanych mianem stulecia...
1,1-szum-20131104-22-19.txt,train,Redaktorki Display zdobyły się na wysiłek przy...
2,1-szum-20131104-22-19.txt,train,Umożliwiło mi to jednak dokładne zapoznanie si...
3,1-szum-20131104-22-19.txt,train,Główna część realizacji to metalowy szkielet s...
4,1-szum-20131104-22-19.txt,train,"W Warszawie, poza kończonym budynkiem na Wybrz..."
...,...,...,...
417029,W_J_T_Mitchell_Czego_chca_obrazy_ksiazka.txt,eval,:Q8JFN<GIQ< Bö8;PE8AǴQPBGFCJB@@EEP:?;Q@<...
417030,W_J_T_Mitchell_Czego_chca_obrazy_ksiazka.txt,eval,'@K:?<CC 9PöFE@<9PöFC@K<I8KLIFQE8N:8 NQ@...
417031,W_J_T_Mitchell_Czego_chca_obrazy_ksiazka.txt,eval,:QPDFɛ<A<;E8BFBI<ȴCFE8=FID8JKFJLEB}NJGF...
417032,W_J_T_Mitchell_Czego_chca_obrazy_ksiazka.txt,eval, C8K<>FGIF>I8DFNǛIFQGI8NǴ'@K :?<CC8FKN@<I...


In [12]:
from datasets import DatasetDict, Dataset
import pandas as pd

# Assuming 'df' is the DataFrame created from your text files
# The df columns: 'file_name', 'split', 'text'

# Create separate DataFrames for train and validation
train_df = df[df['split'] == 'train'][['text']]
eval_df = df[df['split'] == 'eval'][['text']]

# Convert DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import BitsAndBytesConfig
import torch
import os

# Set Hugging Face API token
os.environ['HUGGINGFACE_API_KEY'] = 'Here-Key'

# Configuration parameters
model_name = "bunnycore/LLama-3.1-3b-rp-lora"
dataset_name = "wikitext"
output_dir = "./resultssss"
num_train_epochs = 3
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
optim = "adamw_torch"
save_steps = 1000
logging_steps = 200
learning_rate = 5e-5
weight_decay = 0.01
fp16 = True
bf16 = False
max_grad_norm = 1.0
max_steps = -1
warmup_ratio = 0.1
group_by_length = True
lr_scheduler_type = "linear"
packing = False
max_seq_length = 512
lora_alpha = 16
lora_dropout = 0.1
lora_r = 8
use_4bit = True
bnb_4bit_compute_dtype = "bfloat16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = True
device_map = "auto"

# Reduce dataset size for training
small_train_dataset = train_dataset
small_eval_dataset = eval_dataset

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ['HUGGINGFACE_API_KEY'])

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_seq_length)

tokenized_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)


# Load pre-trained model with quantization configuration for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=os.environ['HUGGINGFACE_API_KEY']
)


# Define the LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["self_attn.k_proj", "self_attn.v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Initialize LoRA with quantized model
model = get_peft_model(model, lora_config)


# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    save_steps=save_steps,
    logging_steps=logging_steps,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)


# Custom Data Collator to return loss
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model(output_dir)



# Testing the fine-tuned model
test_input = "The history of natural language processing"

# Ensure input tensor is on the correct device
test_input_ids = test_input_ids.to(model.device)

# Create attention mask to avoid warnings during generation
attention_mask = (test_input_ids != tokenizer.pad_token_id).long()

# Generate text with attention mask and device consistency
generated_text = model.generate(
    test_input_ids,
    attention_mask=attention_mask,
    max_length=500
)

# Print the generated output
print(tokenizer.decode(generated_text[0], skip_special_tokens=True))

Map: 100%|██████████| 333562/333562 [00:59<00:00, 5606.22 examples/s]
Map: 100%|██████████| 83472/83472 [00:14<00:00, 5632.47 examples/s]
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
  0%|          | 200/500343 [05:01<208:15:14,  1.50s/it]

{'loss': 4.6983, 'grad_norm': 2.4613547325134277, 'learning_rate': 1.9686219646247629e-07, 'epoch': 0.0}


  0%|          | 400/500343 [10:03<209:20:24,  1.51s/it]

{'loss': 4.6591, 'grad_norm': 2.609933376312256, 'learning_rate': 3.9672229439392424e-07, 'epoch': 0.0}


  0%|          | 600/500343 [15:03<208:50:39,  1.50s/it]

{'loss': 4.373, 'grad_norm': 1.5534205436706543, 'learning_rate': 5.965823923253723e-07, 'epoch': 0.0}


  0%|          | 665/500343 [16:40<199:03:19,  1.43s/it]