In [2]:
import os
import transformers
import torch
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [12]:
from datasets import Dataset
import os

# Path to your dataset directory
data_dir = "Stutter_Main 3" 

# Read every .txt file in the directory; each file becomes one dataset row.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# row order identical across runs and machines.
text_data = []
for file_name in sorted(os.listdir(data_dir)):
    if file_name.endswith(".txt"):
        with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as file:
            text_data.append(file.read())

# Fail here with a clear message instead of an opaque IndexError on dataset[0].
if not text_data:
    raise ValueError(f"No .txt files found in {data_dir!r}")

# Column layout of the dataset: one "text" entry per file plus a placeholder label.
dataset_dict = {
    "text": text_data,
    "label": [0] * len(text_data)  # Dummy labels, replace with your actual labels
}

# Create a Dataset object
dataset = Dataset.from_dict(dataset_dict)

# Print the first example as a sanity check
print(dataset[0])

{'text': 'Spasmodic dysphonia, also known as laryngeal dystonia, is a disorder in which the muscles that generate a person\'s voice go into periods of spasm.[1][2] This results in breaks or interruptions in the voice, often every few sentences, which can make a person difficult to understand.[1] The person\'s voice may also sound strained or they may be nearly unable to speak.[2] Onset is often gradual and the condition is lifelong.[1]\n\n\nThe cause is unknown.[1] Risk factors may include family history.[1] Triggers may include an upper respiratory infection, injury to the larynx, overuse of the voice, and psychological stress.[1] The underlying mechanism is believed to typically involve the central nervous system, specifically the basal ganglia.[1] Diagnosis is typically made following examination by a team of healthcare providers.[1]  It is a type of focal dystonia.[3]\n\n\nWhile there is no cure, treatment may improve symptoms.[1] Most commonly this involves injecting botulinum tox

In [5]:
# Hugging Face access token, read later via os.environ['HF_TOKEN'] when the
# tokenizer and model are downloaded.
# Do not paste the token into the notebook: a token already exported in the
# environment is kept as-is, otherwise it is requested interactively so it
# never ends up in the saved file.
import getpass

if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face token: ")

In [39]:
# Hub identifier of the base checkpoint to fine-tune.
model_id = "google/gemma-7b"

# Quantization settings handed to from_pretrained(): 4-bit loading with the
# "nf4" quant type and bfloat16 as the compute dtype.
quantization_kwargs = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_kwargs)

In [40]:
# Both downloads authenticate with the token stored in the environment.
hf_token = os.environ['HF_TOKEN']

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

# Load the checkpoint with the quantization config defined above; the
# device_map entry {"": 0} assigns the whole model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    device_map={"": 0},
    quantization_config=bnb_config,
)

Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [25:31<00:00, 382.76s/it]
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.42s/it]


In [41]:
# Baseline check: let the freshly loaded model continue a short prompt.
device = "cuda:0"
text = "Quote: Imagination is more,"

# Tokenize to PyTorch tensors and move them to the target device.
encoded = tokenizer(text, return_tensors="pt")
inputs = encoded.to(device)

# Generate at most 20 new tokens and print the decoded first sequence.
outputs = model.generate(**inputs, max_new_tokens=20)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)

Quote: Imagination is more, more important than knowledge.

Albert Einstein

The world is a place of wonder and imagination. It


In [42]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "o_proj", "k_proj", "v_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rank-8 LoRA configuration for causal language modelling.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=lora_target_modules,
)

In [43]:
# Build the supervised fine-tuning trainer: it trains on the "text" column of
# `dataset` and applies the LoRA settings from `lora_config`.
trainer = SFTTrainer(
    model=model,
    packing=True,
    train_dataset=dataset,  # Adjust this if your dataset structure is different
    dataset_text_field="text",  # Specify the text field in your dataset
    # `TrainingArguments` is not imported by name anywhere in this notebook, so
    # the bare name raises NameError on a fresh kernel; reach it through the
    # `transformers` module that is imported at the top instead.
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # gradients accumulated over 4 batches per optimizer step
        warmup_steps=2,
        max_steps=1000,  # takes precedence over num_train_epochs
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,  # log the training loss at every step
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
)

Generating train split: 76 examples [00:00, 1198.62 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [44]:
# Run the fine-tuning loop configured on `trainer` (max_steps=1000). The call
# is the cell's last expression, so its return value is displayed as the output.
trainer.train()

Step,Training Loss
1,22.7668
2,20.0205
3,19.7335
4,19.4531
5,15.1975
6,11.1432
7,8.0694
8,6.8302
9,4.9007
10,5.6799




TrainOutput(global_step=1000, training_loss=0.4166639634571038, metrics={'train_runtime': 2359.2263, 'train_samples_per_second': 1.695, 'train_steps_per_second': 0.424, 'total_flos': 1.9110914162688e+17, 'train_loss': 0.4166639634571038, 'epoch': 52.63157894736842})

In [46]:
# Directory that receives every saved artifact.
output_dir = "saved-model"

# Save through the trainer first, then through the model object itself.
# NOTE(review): both calls write into the same directory; confirm that both
# sets of files are actually needed.
for save_fn in (trainer.save_model, model.save_pretrained):
    save_fn(output_dir)

# Kept as the final bare expression so the written tokenizer file paths are
# still shown as the cell output.
tokenizer.save_pretrained(output_dir)



('saved-model/tokenizer_config.json',
 'saved-model/special_tokens_map.json',
 'saved-model/tokenizer.json')

In [54]:
# Prompt the fine-tuned model with a domain question.
text = "what is Stuttering?"
# Follow the device the model actually lives on instead of hard-coding "cuda:0".
device = model.device
inputs = tokenizer(text, return_tensors="pt").to(device)

# Training puts the model in train mode; switch to eval mode so training-only
# behaviour (e.g. dropout) is disabled while generating.
model.eval()
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

what is Stuttering?


Stuttering is a speech disorder characterized by involuntary, unpredictable disruptions in the flow of speech. These disruptions, known as disfluencies, can occur anywhere within a sentence and may last for any length of time. The person who stutters may or may not be aware of the disfluency as it happens.





In [56]:
import zipfile
import os

def zip_folder(folder_path, zip_path):
    """Compress every file under ``folder_path`` into a ZIP archive at ``zip_path``.

    Entries are stored with paths relative to ``folder_path`` and are added in
    sorted order so the archive layout is deterministic.

    Args:
        folder_path: Directory whose files are archived recursively.
        zip_path: Destination archive path; an existing file is overwritten.

    Raises:
        NotADirectoryError: If ``folder_path`` is not an existing directory.
            Checked before the archive is created, so no empty zip is left behind.
    """
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"Not a directory: {folder_path!r}")

    # The archive may live inside the folder being zipped; remember its
    # absolute path so it is never added to itself.
    archive_abspath = os.path.abspath(zip_path)

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort makes os.walk descend in a stable order
            for file in sorted(files):
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

# Bundle the saved model directory into a single archive file.
zip_file_path = "model-saved-7b.zip"
folder_to_zip = "saved-model"

zip_folder(folder_path=folder_to_zip, zip_path=zip_file_path)
