<a href="https://colab.research.google.com/github/ShafieYusuf/C-sharp-project1/blob/main/GPT_2_XL_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies (-q: quiet output, -U: upgrade to the newest version).
# bitsandbytes is installed from PyPI; it is not referenced in the visible training cell.
!pip install -q -U bitsandbytes
# transformers, peft and accelerate are installed from the tip of their GitHub
# repositories, so the exact versions depend on when this cell is run.
# NOTE(review): consider pinning a tag/commit if reproducible runs are needed.
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# datasets provides load_dataset, which the training cell imports.
!pip install -q -U datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone


In [None]:
# File: train_somali_code_generator.py
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import json
import re
from collections import Counter
import random
import torch

# Select the compute device: CUDA when available, otherwise CPU.
# `get_ipython` is only defined inside IPython/Jupyter sessions; guard the
# lookup so the same code also runs as a plain Python script.
try:
    in_colab = 'google.colab' in str(get_ipython())
except NameError:
    in_colab = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if in_colab:
    # Report the accelerator that is actually attached instead of assuming one.
    if device.type == "cuda":
        accelerator = torch.cuda.get_device_name(0)
    else:
        accelerator = "CPU (no GPU runtime attached)"
    print(f'Running in Google Colab. Using {accelerator}.')

# 1. Load and prepare the model
model_name = "gpt2-xl"  # Changed to GPT-2 XL

tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add special tokens to the tokenizer. The pad and start-of-text tokens are
# registered here so the tokenizer can pad batches and recognise the markers
# used when formatting the training examples.
special_tokens = {'pad_token': '<|pad|>', 'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>'}
tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)

# Grow the embedding matrix to cover the added tokens *before* moving the
# model to the accelerator, so the resize does not allocate a second copy of
# the embeddings in (scarce) GPU memory.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Gradient checkpointing trades compute for memory; the generation KV cache
# cannot be used together with it, so switch the cache off explicitly.
model.config.use_cache = False
model.gradient_checkpointing_enable()
model.to(device)

# 2. Load and prepare your JSON dataset
def load_json_data(file_path: str) -> list:
    """Load records from a JSON file and format them as training strings.

    The file must contain a JSON array of objects. Each object is rendered as::

        <|startoftext|>{prompt}Fasiraad: {explanation}Koodh: {code}<|endoftext|>

    A missing key and an explicit JSON ``null`` are both rendered as an empty
    string, so the literal text "None" never ends up in the training data.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        One formatted string per record, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def field(item, key):
        # dict.get(key, '') only covers absent keys; a present-but-null value
        # would still come back as None, so normalise that case here.
        value = item.get(key)
        return '' if value is None else value

    # NOTE(review): the fields are joined with no whitespace before
    # "Fasiraad:" / "Koodh:" -- kept as-is; confirm this is the intended format.
    return [
        f"<|startoftext|>{field(item, 'prompt')}"
        f"Fasiraad: {field(item, 'explanation')}"
        f"Koodh: {field(item, 'code')}<|endoftext|>"
        for item in data
    ]

# Load your data
train_data = load_json_data('somali_codegen_dataset.json')

# Save the formatted examples as JSON Lines (one JSON object per line).
# A plain-text file read with the 'text' loader is split on every newline, so
# an example whose code spans several lines would be broken into separate
# fragment examples. json.dumps escapes embedded newlines, keeping each
# example on a single physical line and therefore intact.
with open('formatted_train_data.jsonl', 'w', encoding='utf-8') as f:
    for item in train_data:
        f.write(json.dumps({'text': item}) + '\n')

# 3. Load dataset using Hugging Face datasets; each record exposes a "text" column
dataset = load_dataset('json', data_files={'train': 'formatted_train_data.jsonl'})

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating/padding each to 512 tokens."""
    # Use shorter sequence length for Colab compatibility
    return tokenizer(examples["text"], truncation=True, max_length=512, padding="max_length")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. Set up training arguments optimized for Colab
training_args = TrainingArguments(
    output_dir="./gpt2-xl-somali-coder",
    overwrite_output_dir=True,
    num_train_epochs=3,
    # Batch of 1 with 4 accumulation steps -> effective batch size of 4 per device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=2000,  # Checkpoint every 2000 optimizer steps
    save_total_limit=2,  # Only keep the 2 most recent checkpoints
    logging_steps=100,
    learning_rate=5e-5,
    # fp16 mixed precision is only supported on CUDA; the device selection
    # falls back to CPU, so enable it only when a GPU is actually present.
    fp16=torch.cuda.is_available(),
    warmup_steps=100,
    logging_dir="./logs",
    report_to="none",
    gradient_checkpointing=True,
)

# 5. Create data collator (mlm=False -> causal language-modeling batches)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# 6. Create and run trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

print("Starting training...")
trainer.train()

# 7. Save the model
# save_model() writes to training_args.output_dir; save the tokenizer to the
# same configured directory instead of repeating the path literal, so the two
# can never drift apart.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)
print("Training completed and model saved!")

# 8. Define the quality check and fallback functions
def is_low_quality_response(response):
    """Return True when a generated response should be rejected.

    A response is rejected when any of the following holds:
      * it contains no fenced (triple-backtick) code block;
      * it has more than 15 whitespace-separated words and one of them
        occurs more than 8 times;
      * it is shorter than 50 characters.
    """
    # A usable answer must carry at least one ```...``` block.
    has_code_block = re.search(r'```.*?```', response, re.DOTALL) is not None
    if not has_code_block:
        return True

    # Repetition is only judged on responses long enough for it to matter.
    tokens = response.split()
    if len(tokens) > 15 and max(Counter(tokens).values()) > 8:
        return True

    # Whatever is left is acceptable unless it is very short.
    return len(response) < 50

def get_fallback_response():
    """Return a canned reply: a randomly chosen explanation plus a code template.

    The result is one of three fixed explanation sentences, a newline, and a
    fixed fenced Python snippet showing a generic two-parameter function.
    """
    explanations = (
        "Waan ka shaqeynayaa su'aashaada. Hadaan fahmin si buuxda, hakan waa koodh caadi ah oo laga yaabaa in ay kaa caawiso.",
        "Su'aashu way adag tahay. Halkan waxaa ku jira koodh guud oo ku saabsan sameynta function.",
        "Fahamka su'aasha waa la igu hayaa. Tani waa hab guud oo loo sameeyo.",
    )

    # The template is assembled line by line; joining with "\n" yields the
    # exact snippet text, fences included.
    template_lines = (
        "```python",
        "# Ku talo: Badal functionka iyo variable-yaasha sifa ay ugu habboonaadaan baahidaaga",
        "def function_name(parameter1, parameter2):",
        "    # Add your logic here",
        "    result = parameter1 + parameter2 # Example operation",
        "    return result",
        "",
        "# Sida loo isticmaalo:",
        "# output = function_name(5, 3)",
        "# print(output) # This will print 8",
        "```",
    )
    snippet = "\n".join(template_lines)

    chosen = random.choice(explanations)
    return f"{chosen}\n{snippet}"

# Example usage of the functions (for testing)
# print(is_low_quality_response("This is a short response."))
# print(is_low_quality_response("```print('hello')```"))
# print(is_low_quality_response("word " * 20))
# print(get_fallback_response())

Running in Google Colab. Using T4 GPU.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/117479 [00:00<?, ? examples/s]

Starting training...


Step,Training Loss
100,3.5531
200,2.8452
300,2.5951
400,2.7137
500,2.4683
600,2.3076
700,2.273
800,2.2181
900,2.2191
1000,2.1563


Step,Training Loss
100,3.5531
200,2.8452
300,2.5951
400,2.7137
500,2.4683
600,2.3076
700,2.273
800,2.2181
900,2.2191
1000,2.1563
