<a href="https://colab.research.google.com/github/RafalW3bCraft/RWC-FinTunna/blob/main/RWC_FinTunna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RWC-FinTunna



# System Configuration

In [1]:
import torch
import os

# Report the PyTorch / CUDA environment this kernel is running on.
has_cuda = torch.cuda.is_available()

print("🚀 SYSTEM INFORMATION")
print("=" * 50)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")

if not has_cuda:
    print("❌ No GPU detected! Please enable GPU in Runtime -> Change runtime type")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1024**3:.1f} GB")
    print(f"CUDA version: {torch.version.cuda}")

# Process-wide switches: tokenizer parallelism off, Weights & Biases logging off for now.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "WANDB_DISABLED": "true",
})


🚀 SYSTEM INFORMATION
PyTorch version: 2.8.0+cu126
CUDA available: True
GPU Device: Tesla T4
GPU Memory: 14.7 GB
CUDA version: 12.6


In [4]:
print("📦 Installing Unsloth and dependencies for efficient fine-tuning [95][97][99]...")

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes xformers datasets -q
!pip install transformers diffusers opencv-python pillow matplotlib scikit-learn pandas numpy -q

print("✅ Installation completed!")


📦 Installing Unsloth and dependencies for efficient fine-tuning [95][97][99]...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
✅ Installation completed!


In [7]:
# Shared imports for the whole notebook.
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Core libraries: tensors, data handling, plotting and evaluation metrics
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Hugging Face libraries: model/tokenizer loaders, Trainer API, quantization, datasets, PEFT/LoRA
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForCausalLM,
    AutoModelForSequenceClassification, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer, DataCollatorForSeq2Seq,
    BitsAndBytesConfig, pipeline
)
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# Unsloth for efficient training
# NOTE(review): Unsloth's documentation recommends importing `unsloth` before
# transformers / peft / trl so that its patches apply — confirm whether the
# import order in this cell matters for this notebook.
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer  # only SFTTrainer is imported; the SFTConfig import was removed

# For image generation capabilities
from diffusers import StableDiffusionPipeline, DiffusionPipeline
from PIL import Image

print("✅ All libraries imported successfully!")
print("🔧 Setting up project configuration...")

✅ All libraries imported successfully!
🔧 Setting up project configuration...


In [8]:
# Project configuration and global settings
class ProjectConfig:
    """Central settings shared by the model-setup and training-argument helpers.

    All values are class attributes and are read as ``ProjectConfig.<NAME>``;
    the class is never instantiated.
    """

    # Model checkpoints (Hugging Face Hub identifiers)
    STARCODER_MODEL = "bigcode/starcoder2-3b"         # Code generation
    ROBERTA_MODEL = "roberta-base"                    # Classification tasks
    FLAN_T5_MODEL = "google/flan-t5-small"           # Text-to-text

    # Training parameters sized for a T4 GPU
    MAX_SEQ_LENGTH = 1024                             # Reduced for T4 memory
    BATCH_SIZE = 1                                    # Conservative for T4
    GRADIENT_ACCUMULATION_STEPS = 8                   # Simulate larger batch
    LEARNING_RATE = 2e-4                              # Standard for LoRA
    WARMUP_STEPS = 100
    MAX_STEPS = 500                                   # Start with smaller steps

    # LoRA configuration for efficient fine-tuning
    LORA_R = 16                                       # Rank for LoRA adapters
    LORA_ALPHA = 32                                   # Scaling factor
    LORA_DROPOUT = 0.05                               # Dropout for regularization

    # Quantization config for memory efficiency
    USE_4BIT = True
    # Use bfloat16 only on GPUs with native support (compute capability >= 8.0).
    # Older GPUs such as the T4 (7.5) and CPU-only sessions fall back to float16
    # instead of a hard-coded bfloat16.
    BNB_4BIT_COMPUTE_DTYPE = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
        else torch.float16
    )
    BNB_4BIT_QUANT_TYPE = "nf4"
    BNB_4BIT_USE_DOUBLE_QUANT = True

    # Output directories (one sub-directory per fine-tuned model)
    OUTPUT_DIR = "./outputs"
    STARCODER_OUTPUT = f"{OUTPUT_DIR}/starcoder2-3b-finetuned"
    ROBERTA_OUTPUT = f"{OUTPUT_DIR}/roberta-finetuned"
    FLAN_T5_OUTPUT = f"{OUTPUT_DIR}/flan-t5-finetuned"

# Make sure every checkpoint directory exists before any training starts.
import os

output_dirs = (
    ProjectConfig.OUTPUT_DIR,
    ProjectConfig.STARCODER_OUTPUT,
    ProjectConfig.ROBERTA_OUTPUT,
    ProjectConfig.FLAN_T5_OUTPUT,
)
for output_path in output_dirs:
    os.makedirs(output_path, exist_ok=True)

# Summarise which checkpoint is assigned to which task.
print("⚙️ Configuration loaded successfully!")
print("🎯 Models to fine-tune:")
model_roles = (
    ("Code Generation", ProjectConfig.STARCODER_MODEL),
    ("Text Classification", ProjectConfig.ROBERTA_MODEL),
    ("Text-to-Text", ProjectConfig.FLAN_T5_MODEL),
)
for position, (role, checkpoint) in enumerate(model_roles, start=1):
    print(f"  {position}. {role}: {checkpoint}")


⚙️ Configuration loaded successfully!
🎯 Models to fine-tune:
  1. Code Generation: bigcode/starcoder2-3b
  2. Text Classification: roberta-base
  3. Text-to-Text: google/flan-t5-small


In [9]:
def print_gpu_utilization():
    """Print the CUDA memory PyTorch has allocated and reserved, in GB.

    Prints a "no GPU" message instead when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        print("❌ No GPU available")
        return

    bytes_per_gb = 1024**3
    readers = (
        ("Allocated", torch.cuda.memory_allocated),
        ("Reserved", torch.cuda.memory_reserved),
    )
    for label, read_bytes in readers:
        print(f"🔋 GPU Memory - {label}: {read_bytes()/bytes_per_gb:.1f}GB")

def clear_memory():
    """Free unreachable Python objects, then release PyTorch's cached CUDA memory.

    ``torch.cuda.empty_cache()`` can only return blocks that no live tensor
    uses, so the garbage collector runs first to drop tensors that are kept
    alive only by reference cycles. Does nothing on the GPU side (and prints
    nothing) when CUDA is unavailable.
    """
    import gc

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("🧹 GPU memory cleared")

def setup_quantization_config():
    """Build a ``BitsAndBytesConfig`` from the quantization fields of ``ProjectConfig``."""
    bnb_options = {
        "load_in_4bit": ProjectConfig.USE_4BIT,
        "bnb_4bit_compute_dtype": ProjectConfig.BNB_4BIT_COMPUTE_DTYPE,
        "bnb_4bit_quant_type": ProjectConfig.BNB_4BIT_QUANT_TYPE,
        "bnb_4bit_use_double_quant": ProjectConfig.BNB_4BIT_USE_DOUBLE_QUANT,
    }
    return BitsAndBytesConfig(**bnb_options)

# Confirm the helpers are defined and show the starting GPU memory footprint.
print("🛠️ Utility functions ready!")
print_gpu_utilization()


🛠️ Utility functions ready!
🔋 GPU Memory - Allocated: 0.0GB
🔋 GPU Memory - Reserved: 0.0GB


In [11]:
# Authenticate with Hugging Face (optional but recommended)
from huggingface_hub import notebook_login

print("🔐 Hugging Face Authentication")
print("This is optional but recommended for downloading private models")
print("and uploading your fine-tuned models")

try:
    # Renders the login widget. In the recorded run the next message printed
    # straight after the widget, so a normal return only means the prompt was
    # shown — not that a token was entered or accepted.
    notebook_login()
except Exception as e:
    print(f"⚠️ Authentication failed: {e}")
    print("Continuing without authentication...")
else:
    # The previous message claimed authentication was "skipped" on every run.
    print("✅ Login prompt displayed - enter a token above, or continue without authenticating")


🔐 Hugging Face Authentication
This is optional but recommended for downloading private models
and uploading your fine-tuned models


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

✅ Ready to proceed (authentication skipped)


In [13]:
print("🚀 SETTING UP STARCODER2-3B FOR CODE GENERATION")
print("=" * 60)

def setup_starcoder2_model():
    """Load StarCoder2-3B through Unsloth and attach LoRA adapters.

    The checkpoint, sequence length, 4-bit switch and LoRA hyper-parameters
    are all read from ``ProjectConfig``.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the LoRA-wrapped model
        returned by ``FastLanguageModel.get_peft_model``.
    """
    clear_memory()

    # Use Unsloth for efficient loading
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=ProjectConfig.STARCODER_MODEL,
        max_seq_length=ProjectConfig.MAX_SEQ_LENGTH,
        dtype=None,  # Auto-detect
        load_in_4bit=ProjectConfig.USE_4BIT,  # honour the shared config instead of a hard-coded True
        # token="hf_..." # Uncomment if using private models
    )

    # Add LoRA adapters to the attention and MLP projections.
    # The recorded run shows Unsloth warning that only dropout = 0 uses its
    # fast patching path; the configured non-zero dropout costs some speed.
    model = FastLanguageModel.get_peft_model(
        model,
        r=ProjectConfig.LORA_R,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        lora_alpha=ProjectConfig.LORA_ALPHA,
        lora_dropout=ProjectConfig.LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    print(f"✅ StarCoder2-3B loaded successfully!")
    print_gpu_utilization()

    return model, tokenizer

# Load StarCoder2 now. Comment this call out to defer loading and save GPU memory.
starcoder_model, starcoder_tokenizer = setup_starcoder2_model()
print("📝 StarCoder2-3B setup function ready!")


🚀 SETTING UP STARCODER2-3B FOR CODE GENERATION
🧹 GPU memory cleared
==((====))==  Unsloth 2025.10.1: Fast Starcoder2 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

bigcode/starcoder2-3b does not have a padding token! Will use pad_token = <|endoftext|>.


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model` require gradients
✅ StarCoder2-3B loaded successfully!
🔋 GPU Memory - Allocated: 1.7GB
🔋 GPU Memory - Reserved: 2.9GB
📝 StarCoder2-3B setup function ready!


In [14]:
print("🚀 SETTING UP ROBERTA BASE FOR CLASSIFICATION")
print("=" * 60)

def setup_roberta_model(num_labels=2):
    """Load quantized RoBERTa for sequence classification and wrap it with LoRA.

    Args:
        num_labels: Number of target classes for the classification head.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped classifier.
    """
    clear_memory()

    checkpoint = ProjectConfig.ROBERTA_MODEL
    roberta_tok = AutoTokenizer.from_pretrained(checkpoint)

    bnb_config = setup_quantization_config()

    # Quantized sequence-classification model, placed automatically on the available device(s)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # Prepare the quantized weights for k-bit training
    classifier = prepare_model_for_kbit_training(classifier)

    # LoRA adapter settings for the classification task
    adapter_options = dict(
        task_type=TaskType.SEQ_CLS,
        r=ProjectConfig.LORA_R,
        lora_alpha=ProjectConfig.LORA_ALPHA,
        target_modules=["query", "value", "key", "dense"],
        lora_dropout=ProjectConfig.LORA_DROPOUT,
        bias="none",
    )
    classifier = get_peft_model(classifier, LoraConfig(**adapter_options))

    print("✅ RoBERTa Base loaded successfully!")
    print_gpu_utilization()

    return classifier, roberta_tok

# Load RoBERTa now. Comment this call out to defer loading and save GPU memory.
roberta_model, roberta_tokenizer = setup_roberta_model()
print("📝 RoBERTa Base setup function ready!")


🚀 SETTING UP ROBERTA BASE FOR CLASSIFICATION
🧹 GPU memory cleared


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ RoBERTa Base loaded successfully!
🔋 GPU Memory - Allocated: 1.9GB
🔋 GPU Memory - Reserved: 3.0GB
📝 RoBERTa Base setup function ready!


In [15]:
print("🚀 SETTING UP FLAN-T5-SMALL FOR TEXT-TO-TEXT")
print("=" * 60)

def setup_flan_t5_model():
    """Load quantized Flan-T5 for seq2seq generation and wrap it with LoRA.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped seq2seq model.
    """
    clear_memory()

    checkpoint = ProjectConfig.FLAN_T5_MODEL
    t5_tok = AutoTokenizer.from_pretrained(checkpoint)

    bnb_config = setup_quantization_config()

    # Quantized encoder-decoder model, placed automatically on the available device(s)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        quantization_config=bnb_config,
        device_map="auto",
    )
    seq2seq = prepare_model_for_kbit_training(seq2seq)

    # LoRA adapter settings: attention projections plus the feed-forward layers
    adapter_options = dict(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=ProjectConfig.LORA_R,
        lora_alpha=ProjectConfig.LORA_ALPHA,
        target_modules=["q", "v", "k", "o", "wi_0", "wi_1", "wo"],
        lora_dropout=ProjectConfig.LORA_DROPOUT,
        bias="none",
    )
    seq2seq = get_peft_model(seq2seq, LoraConfig(**adapter_options))

    print("✅ Flan-T5-Small loaded successfully!")
    print_gpu_utilization()

    return seq2seq, t5_tok

# Load Flan-T5 now. Comment this call out to defer loading and save GPU memory.
flan_t5_model, flan_t5_tokenizer = setup_flan_t5_model()
print("📝 Flan-T5-Small setup function ready!")


🚀 SETTING UP FLAN-T5-SMALL FOR TEXT-TO-TEXT
🧹 GPU memory cleared


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ Flan-T5-Small loaded successfully!
🔋 GPU Memory - Allocated: 2.1GB
🔋 GPU Memory - Reserved: 3.0GB
📝 Flan-T5-Small setup function ready!


In [16]:
print("⚙️ SETTING UP TRAINING CONFIGURATIONS")
print("=" * 60)

def get_starcoder_training_args():
    """Training arguments for StarCoder2-3B code generation.

    Returns a ``TrainingArguments`` instance. ``SFTConfig`` is not imported in
    this notebook (the install cell pins ``trl<0.9.0``), so referencing it here
    raised ``NameError``. ``TrainingArguments`` has no ``max_seq_length`` field;
    pass ``max_seq_length=ProjectConfig.MAX_SEQ_LENGTH`` to ``SFTTrainer``
    when building the trainer.
    """
    # Pick bf16 only on GPUs with native support (compute capability >= 8.0).
    # NOTE(review): newer PyTorch releases may report emulated bf16 as supported
    # through torch.cuda.is_bf16_supported() — confirm on the target runtime.
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.get_device_capability()[0] >= 8

    return TrainingArguments(
        per_device_train_batch_size=ProjectConfig.BATCH_SIZE,
        gradient_accumulation_steps=ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=ProjectConfig.WARMUP_STEPS,
        max_steps=ProjectConfig.MAX_STEPS,
        learning_rate=ProjectConfig.LEARNING_RATE,
        fp16=has_cuda and not use_bf16,  # half precision needs a GPU
        bf16=use_bf16,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=ProjectConfig.STARCODER_OUTPUT,
    )

def get_roberta_training_args():
    """Training arguments for RoBERTa classification.

    Evaluates every 50 steps and checkpoints every 100 steps into
    ``ProjectConfig.ROBERTA_OUTPUT``.
    """
    # Pick bf16 only on GPUs with native support (compute capability >= 8.0).
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.get_device_capability()[0] >= 8

    return TrainingArguments(
        output_dir=ProjectConfig.ROBERTA_OUTPUT,
        learning_rate=ProjectConfig.LEARNING_RATE,
        per_device_train_batch_size=ProjectConfig.BATCH_SIZE,
        per_device_eval_batch_size=ProjectConfig.BATCH_SIZE,
        gradient_accumulation_steps=ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs=3,
        weight_decay=0.01,
        # `evaluation_strategy` was renamed to `eval_strategy`; the recorded
        # Transformers 4.56.2 runtime no longer accepts the old keyword.
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=100,
        logging_steps=10,
        warmup_steps=ProjectConfig.WARMUP_STEPS,
        fp16=has_cuda and not use_bf16,  # half precision needs a GPU
        bf16=use_bf16,
        optim="adamw_8bit",
        dataloader_pin_memory=False,
        remove_unused_columns=False,
    )

def get_flan_t5_training_args():
    """Training arguments for Flan-T5 text-to-text.

    Returns ``Seq2SeqTrainingArguments`` (a ``TrainingArguments`` subclass),
    because ``predict_with_generate`` is only defined on the seq2seq variant;
    passing it to plain ``TrainingArguments`` raises ``TypeError``.
    """
    # Imported locally because the notebook's import cell does not bring this name in.
    from transformers import Seq2SeqTrainingArguments

    # Pick bf16 only on GPUs with native support (compute capability >= 8.0).
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.get_device_capability()[0] >= 8

    return Seq2SeqTrainingArguments(
        output_dir=ProjectConfig.FLAN_T5_OUTPUT,
        learning_rate=ProjectConfig.LEARNING_RATE,
        per_device_train_batch_size=ProjectConfig.BATCH_SIZE,
        per_device_eval_batch_size=ProjectConfig.BATCH_SIZE,
        gradient_accumulation_steps=ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs=3,
        weight_decay=0.01,
        # `evaluation_strategy` was renamed to `eval_strategy`; the recorded
        # Transformers 4.56.2 runtime no longer accepts the old keyword.
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=100,
        logging_steps=10,
        warmup_steps=ProjectConfig.WARMUP_STEPS,
        fp16=has_cuda and not use_bf16,  # half precision needs a GPU
        bf16=use_bf16,
        optim="adamw_8bit",
        predict_with_generate=True,
        dataloader_pin_memory=False,
    )

print("✅ Training configurations ready!")
print("📊 Next steps: Load datasets and start training individual models")

⚙️ SETTING UP TRAINING CONFIGURATIONS
✅ Training configurations ready!
📊 Next steps: Load datasets and start training individual models


In [17]:
# === PROJECT STATUS & NEXT STEPS ===
# The banner is data-driven: each entry is a heading followed by its lines.
print("🎯 PROJECT INITIALIZATION COMPLETE!")
print("=" * 60)

status_sections = (
    ("✅ COMPLETED SETUP:", (
        "  • GPU configuration verified",
        "  • All dependencies installed",
        "  • Model setup functions ready",
        "  • Training configurations prepared",
        "  • Memory monitoring utilities active",
    )),
    ("\n🚀 READY FOR PHASE ONE:", (
        "  1. StarCoder2-3B: Code generation fine-tuning",
        "  2. RoBERTa Base: Classification tasks",
        "  3. Flan-T5-Small: Enhanced chat capabilities",
    )),
    ("\n⚠️ IMPORTANT NOTES:", (
        "  • Models are not loaded yet (to save memory)",
        "  • Load one model at a time for training",
        "  • Monitor GPU memory usage closely",
        "  • Use gradient checkpointing for memory efficiency",
    )),
    ("\n📝 NEXT ACTIONS:", (
        "  1. Prepare training datasets for each model",
        "  2. Load and train StarCoder2-3B first",
        "  3. Clear memory and train RoBERTa",
        "  4. Clear memory and train Flan-T5",
        "  5. Implement multimodal fusion (Phase 2)",
    )),
)
for heading, entries in status_sections:
    print(heading)
    for entry in entries:
        print(entry)

# List the per-model checkpoint directories taken from ProjectConfig.
print("\n💾 Output directories created:")
checkpoint_dirs = (
    ProjectConfig.STARCODER_OUTPUT,
    ProjectConfig.ROBERTA_OUTPUT,
    ProjectConfig.FLAN_T5_OUTPUT,
)
for output_dir in checkpoint_dirs:
    print(f"  • {output_dir}")

print("\n🔄 Ready to begin Phase One implementation!")
print_gpu_utilization()


🎯 PROJECT INITIALIZATION COMPLETE!
✅ COMPLETED SETUP:
  • GPU configuration verified
  • All dependencies installed
  • Model setup functions ready
  • Training configurations prepared
  • Memory monitoring utilities active

🚀 READY FOR PHASE ONE:
  1. StarCoder2-3B: Code generation fine-tuning
  2. RoBERTa Base: Classification tasks
  3. Flan-T5-Small: Enhanced chat capabilities

⚠️ IMPORTANT NOTES:
  • Models are not loaded yet (to save memory)
  • Load one model at a time for training
  • Monitor GPU memory usage closely
  • Use gradient checkpointing for memory efficiency

📝 NEXT ACTIONS:
  1. Prepare training datasets for each model
  2. Load and train StarCoder2-3B first
  3. Clear memory and train RoBERTa
  4. Clear memory and train Flan-T5
  5. Implement multimodal fusion (Phase 2)

💾 Output directories created:
  • ./outputs/starcoder2-3b-finetuned
  • ./outputs/roberta-finetuned
  • ./outputs/flan-t5-finetuned

🔄 Ready to begin Phase One implementation!
🔋 GPU Memory - Allocat

# **1️⃣ Code Generation / Programming Datasets (17+)**

| # | Dataset | Description | Notes / Language | Link |
|---|---------|------------|-----------------|------|
| 1 | `codeparrot/codeparrot-clean` | Clean Python GitHub code | Python | [Link](https://huggingface.co/datasets/codeparrot/codeparrot-clean) |
| 2 | `bigcode/the-stack` | Large multi-language code corpus | Multi-language | [Link](https://huggingface.co/datasets/bigcode/the-stack) |
| 3 | `bigcode/the-stack-smol` | Small subset for testing | Multi-language | [Link](https://huggingface.co/datasets/bigcode/the-stack-smol) |
| 4 | `codeparrot/github-jupyter-text` | Jupyter notebooks, code + markdown | Python | [Link](https://huggingface.co/datasets/codeparrot/github-jupyter-text) |
| 5 | `codeparrot/cleaned-java` | Cleaned Java code | Java | [Link](https://huggingface.co/datasets/codeparrot/cleaned-java) |
| 6 | `codeparrot/cleaned-cpp` | Cleaned C++ code | C++ | [Link](https://huggingface.co/datasets/codeparrot/cleaned-cpp) |
| 7 | `codeparrot/cleaned-javascript` | Cleaned JS code | JavaScript | [Link](https://huggingface.co/datasets/codeparrot/cleaned-javascript) |
| 8 | `HuggingFace Code Dataset hub` | Collection of code datasets | Multiple | [Link](https://huggingface.co/datasets?search=code) |
| 9 | `CodeSearchNet` | Code + docstring pairs for search | Python, Java, Go, etc. | [Link](https://huggingface.co/datasets/code_search_net) |
| 10 | `CodeContests` | Competitive programming solutions | Python, C++ | [Link](https://huggingface.co/datasets/codecontests) |
| 11 | `HackerRank` | Code challenges and solutions | Python, Java, C++ | [Link](https://www.kaggle.com/datasets/anishathalye/hackerrank-solutions) |
| 12 | `Funcom` | Code summarization dataset | Python | [Link](https://huggingface.co/datasets/funcom) |
| 13 | `CoNaLa` | Code/Natural Language dataset | Python | [Link](https://huggingface.co/datasets/conala) |
| 14 | `Java-Large` | Large Java dataset | Java | [Link](https://huggingface.co/datasets/mbpp) |
| 15 | `CodeXGLUE` | Multiple code tasks (translation, repair, summarization) | Multi-language | [Link](https://github.com/microsoft/CodeXGLUE) |
| 16 | `MBPP` | Python code + natural language | Python | [Link](https://huggingface.co/datasets/mbpp) |
| 17 | `CodeNet` | Large-scale multi-language code dataset | Multi-language | [Link](https://github.com/IBM/Project_CodeNet) |

---

# **2️⃣ NLP / Text Classification / Sentiment Datasets (17+)**

| # | Dataset | Description | Notes | Link |
|---|---------|------------|------|------|
| 1 | `glue/sst2` | Stanford Sentiment Treebank | Binary sentiment | [Link](https://huggingface.co/datasets/glue/viewer/sst2) |
| 2 | IMDB | Movie reviews, binary sentiment | English | [Link](https://huggingface.co/datasets/imdb) |
| 3 | Amazon Reviews | Multi-domain reviews | Multi-class ratings | [Link](https://huggingface.co/datasets/amazon_reviews_multi) |
| 4 | Yelp Reviews | User reviews, multi-class | English | [Link](https://huggingface.co/datasets/yelp_review_full) |
| 5 | Emotion | Multi-class emotion labels | anger, joy, sadness, etc. | [Link](https://huggingface.co/datasets/emotion) |
| 6 | TweetEval | Twitter sentiment | Short informal texts | [Link](https://huggingface.co/datasets/tweeteval) |
| 7 | AG News | News classification | 4 classes | [Link](https://huggingface.co/datasets/ag_news) |
| 8 | DBpedia | Wikipedia text classification | 14 classes | [Link](https://huggingface.co/datasets/dbpedia_14) |
| 9 | 20 Newsgroups | Topic classification | 20 classes | [Link](https://huggingface.co/datasets/20_newsgroups) |
| 10 | CoLA | Grammatical acceptability | Binary | [Link](https://huggingface.co/datasets/glue/viewer/cola) |
| 11 | TREC | Question classification | 6 classes | [Link](https://huggingface.co/datasets/trec) |
| 12 | Financial PhraseBank | Financial sentiment | Positive, negative, neutral | [Link](https://huggingface.co/datasets/financial_phrasebank) |
| 13 | Toxic Comment Classification | Multi-label toxicity | English | [Link](https://huggingface.co/datasets/jigsaw_toxicity_pred) |
| 14 | Multi-Domain Sentiment Dataset (MDSD) | Multi-domain reviews | English | [Link](https://huggingface.co/datasets/mdsd) |
| 15 | Yelp Polarity | Binary sentiment | English | [Link](https://huggingface.co/datasets/yelp_polarity) |
| 16 | Amazon Polarity | Binary sentiment | English | [Link](https://huggingface.co/datasets/amazon_polarity) |
| 17 | SMS Spam Collection | Spam detection | English | [Link](https://huggingface.co/datasets/sms_spam) |

---

# **3️⃣ Instruction-Tuning / Instruction-Following Datasets (17+)**

| # | Dataset | Description | Notes | Link |
|---|---------|------------|------|------|
| 1 | `yahma/alpaca-cleaned` | Instruction + output | 52k examples | [Link](https://huggingface.co/datasets/yahma/alpaca-cleaned) |
| 2 | Stanford Self-Instruct | Multi-domain instructions | 175k+ | [Link](https://huggingface.co/datasets/StanfordAI/self_instruct) |
| 3 | OpenAssistant/oasst1 | Open-source ChatGPT-like instructions | 160k+ | [Link](https://huggingface.co/datasets/OpenAssistant/oasst1) |
| 4 | ShareGPT | Conversations scraped from ChatGPT | Multi-turn dialogue | [Link](https://huggingface.co/datasets/ShareGPT) |
| 5 | Dolly 2.0 | Instruction + output | 15k examples | [Link](https://huggingface.co/datasets/databricks/dolly_2_0) |
| 6 | WizardLM | Instruction + reasoning dataset | 100k+ | [Link](https://huggingface.co/datasets/anon8231489123/wizardlm_unfiltered) |
| 7 | Alpaca-GPT4 | GPT-4 generated instructions | 52k examples | [Link](https://huggingface.co/datasets/tatsu-lab/alpaca_gpt4) |
| 8 | Evol-Instruct | Evolutionary instruction dataset | 100k+ | [Link](https://huggingface.co/datasets/evol-instruct) |
| 9 | FLAN v2 | Google’s FLAN instruction dataset | Multi-task | [Link](https://huggingface.co/datasets/google/flan_v2) |
| 10 | Tulu | Multilingual instruction dataset | 80k+ | [Link](https://huggingface.co/datasets/tulu) |
| 11 | CoT-Instruct | Chain-of-thought instructions | Reasoning tasks | [Link](https://huggingface.co/datasets/cot_instruct) |
| 12 | Super-NaturalInstructions | 1,600+ diverse tasks | Multi-task | [Link](https://huggingface.co/datasets/super_natural_instructions) |
| 13 | MPT-Instruct | MosaicML instruction dataset | Multi-domain | [Link](https://huggingface.co/datasets/mpt_instruct) |
| 14 | Vicuna Dataset | 70k+ instruction-response pairs | Open-source ChatGPT-like | [Link](https://huggingface.co/datasets/vicuna_dataset) |
| 15 | OpenInstruct | 100k+ instructions | Multi-domain | [Link](https://huggingface.co/datasets/openinstruct) |
| 16 | LIMA | Fine-tuning dataset for instruction-following | ~1k high-quality examples | [Link](https://github.com/StanfordAI/LIMA) |
| 17 | Koala | Instruction dataset | ~100k examples | [Link](https://github.com/karpathy/koala) |

---


# Dataset Preprocessing

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Extend any ProjectConfig defined by an earlier cell instead of replacing it.
# A plain redefinition dropped the attributes (LoRA, quantization, output
# paths, ...) that helper functions defined earlier read at call time.
class ProjectConfig(globals().get("ProjectConfig", object)):
    """Settings for the StarCoder2 / CodeParrot preprocessing cell."""
    STARCODER_MODEL = "bigcode/starcoder2-3b"
    MAX_SEQ_LENGTH = 1024   # adjust per your VRAM (e.g., 512, 2048)
    DATASET_SPLIT = "train[:1%]"  # smaller split for testing

print("Loading dataset...")
code_dataset = load_dataset(
    "codeparrot/codeparrot-clean",
    split=ProjectConfig.DATASET_SPLIT,
)

print("Loading tokenizer...")
starcoder_tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.STARCODER_MODEL)

def preprocess_code(example):
    """Tokenize the ``content`` field, truncated to ``ProjectConfig.MAX_SEQ_LENGTH``.

    Mapped with ``batched=True`` below, so ``example`` is a batch of rows.
    The tokenizer output is returned as-is (no tensor conversion is requested).
    """
    return starcoder_tokenizer(
        example["content"],
        max_length=ProjectConfig.MAX_SEQ_LENGTH,
        truncation=True,
    )


print("Tokenizing dataset...")
code_map_options = {
    "remove_columns": code_dataset.column_names,  # keep only the tokenizer outputs
    "batched": True,                              # faster
    "num_proc": 4,                                # parallelize if possible
}
tokenized_code_dataset = code_dataset.map(preprocess_code, **code_map_options)

print("✅ StarCoder2-3B dataset loaded & tokenized!")
print(tokenized_code_dataset)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Extend any ProjectConfig defined by an earlier cell instead of replacing it.
# A plain redefinition dropped the attributes that helper functions defined
# earlier read at call time.
class ProjectConfig(globals().get("ProjectConfig", object)):
    """Settings for the RoBERTa / GLUE SST-2 preprocessing cell."""
    ROBERTA_MODEL = "roberta-base"
    MAX_SEQ_LENGTH = 128

print("Loading tokenizer...")
roberta_tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.ROBERTA_MODEL)

print("Loading GLUE SST-2 dataset...")
glue_train = load_dataset("glue", "sst2", split="train")
glue_val = load_dataset("glue", "sst2", split="validation")

def preprocess_glue(example):
    """Tokenize SST-2 sentences and carry the class label through as ``labels``.

    Mapped with ``batched=True`` below, so ``example`` is a batch of rows.
    The ``map`` calls remove every original column, including ``label``, so
    the label must be copied here or the tokenized dataset has no target.
    """
    result = roberta_tokenizer(
        example["sentence"],
        truncation=True,
        max_length=ProjectConfig.MAX_SEQ_LENGTH,
        padding="max_length",   # ensures consistent input length
    )
    result["labels"] = example["label"]
    return result


print("Tokenizing datasets...")
tokenized_train = glue_train.map(
    preprocess_glue,
    batched=True,
    remove_columns=glue_train.column_names,
    desc="Tokenizing train split"
)

tokenized_val = glue_val.map(
    preprocess_glue,
    batched=True,
    remove_columns=glue_val.column_names,
    desc="Tokenizing validation split"
)

print("✅ RoBERTa Base dataset loaded & tokenized!")
print(tokenized_train)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Extend any ProjectConfig defined by an earlier cell instead of replacing it.
# A plain redefinition dropped the attributes that helper functions defined
# earlier read at call time.
class ProjectConfig(globals().get("ProjectConfig", object)):
    """Settings for the Flan-T5 / Alpaca preprocessing cell."""
    # NOTE(review): the model-setup cell loads "google/flan-t5-small" — confirm
    # which checkpoint size is intended here.
    FLAN_T5_MODEL = "google/flan-t5-base"
    MAX_SEQ_LENGTH = 256
    DATASET_SPLIT = "train[:5%]"  # small slice for testing / Colab RAM

print("Loading tokenizer...")
flan_t5_tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.FLAN_T5_MODEL)

print("Loading Alpaca-cleaned dataset...")
flan_t5_data = load_dataset("yahma/alpaca-cleaned", split=ProjectConfig.DATASET_SPLIT)

def preprocess_alpaca(example):
    """Turn one Alpaca row into padded encoder inputs and loss-masked labels.

    The prompt is the instruction, followed by the optional ``input`` field on
    a new line. Padding positions in the labels are replaced with -100 so the
    loss ignores them; without this the model is also trained to emit padding.
    """
    # Treat a missing or None `input` as empty.
    input_text = example.get("input") or ""
    prompt = example["instruction"]
    if input_text.strip():
        prompt += "\n" + input_text

    target = example["output"]

    # Tokenize prompt and target separately
    model_inputs = flan_t5_tokenizer(
        prompt,
        truncation=True,
        max_length=ProjectConfig.MAX_SEQ_LENGTH,
        padding="max_length",
    )

    labels = flan_t5_tokenizer(
        target,
        truncation=True,
        max_length=ProjectConfig.MAX_SEQ_LENGTH,
        padding="max_length",
    )["input_ids"]

    # Mask padding so it does not contribute to the loss.
    pad_id = flan_t5_tokenizer.pad_token_id
    model_inputs["labels"] = [
        token_id if token_id != pad_id else -100 for token_id in labels
    ]
    return model_inputs

print("Tokenizing Alpaca dataset...")
tokenized_alpaca = flan_t5_data.map(
    preprocess_alpaca,
    batched=False,
    remove_columns=flan_t5_data.column_names,
    desc="Tokenizing Alpaca"
)

print("✅ Flan-T5 dataset loaded & tokenized!")
print(tokenized_alpaca[0])


In [None]:
# Unified dataset preprocessing for all three models
from datasets import load_dataset
from transformers import AutoTokenizer

# Extend any ProjectConfig defined by an earlier cell instead of replacing it.
# A plain redefinition dropped the attributes (LoRA, quantization, output
# paths, ...) that helper functions defined earlier read at call time.
class ProjectConfig(globals().get("ProjectConfig", object)):
    """Per-model dataset settings for the unified preprocessing cell."""

    # StarCoder2-3B (code)
    STARCODER_MODEL = "bigcode/starcoder2-3b"
    CODE_MAX_SEQ_LENGTH = 1024
    CODE_SPLIT = "train[:1%]"  # small slice for testing

    # RoBERTa Base (GLUE SST-2)
    ROBERTA_MODEL = "roberta-base"
    ROBERTA_MAX_SEQ_LENGTH = 128

    # Flan-T5 Base (Instruction Tuning)
    FLAN_T5_MODEL = "google/flan-t5-base"
    FLAN_MAX_SEQ_LENGTH = 256
    FLAN_SPLIT = "train[:5%]"

# 1. StarCoder2-3B / CodeParrot
print("\n=== Loading StarCoder2-3B (CodeParrot) ===")
starcoder_tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.STARCODER_MODEL)
try:
    code_dataset = load_dataset("codeparrot/codeparrot-clean", split=ProjectConfig.CODE_SPLIT)
except Exception as e:
    print("Error loading CodeParrot dataset:", e)
    # Re-raise instead of calling exit(1): in a notebook, exit() ends the
    # kernel session and discards all state. Raising stops this cell with a
    # traceback and leaves the kernel usable.
    raise

def preprocess_code(example):
    """Tokenize the ``content`` field, truncated to ``ProjectConfig.CODE_MAX_SEQ_LENGTH``.

    Mapped with ``batched=True`` below, so ``example`` is a batch of rows.
    """
    tokenized = starcoder_tokenizer(
        example["content"],
        truncation=True,
        max_length=ProjectConfig.CODE_MAX_SEQ_LENGTH,
    )
    return tokenized

tokenized_code_dataset = code_dataset.map(
    preprocess_code,
    batched=True,
    remove_columns=code_dataset.column_names,
    desc="Tokenizing code dataset"
)
print("✅ StarCoder2-3B dataset loaded & tokenized!")


# 2. GLUE SST-2 / RoBERTa
print("\n=== Loading GLUE SST-2 (RoBERTa) ===")
roberta_tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.ROBERTA_MODEL)

glue_train = load_dataset("glue", "sst2", split="train")
glue_val = load_dataset("glue", "sst2", split="validation")

def preprocess_glue(example):
    """Tokenize SST-2 sentences to a fixed length and attach the class labels.

    Mapped with ``batched=True`` below, so ``example`` is a batch of rows.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": ProjectConfig.ROBERTA_MAX_SEQ_LENGTH,
        "padding": "max_length",
    }
    encoded = roberta_tokenizer(example["sentence"], **tokenizer_options)
    encoded["labels"] = example["label"]
    return encoded

# Both splits are tokenized the same way; only the source split and the
# progress-bar description differ.
tokenized_train = glue_train.map(
    preprocess_glue,
    remove_columns=glue_train.column_names,
    desc="Tokenizing GLUE train split",
    batched=True,
)
tokenized_val = glue_val.map(
    preprocess_glue,
    remove_columns=glue_val.column_names,
    desc="Tokenizing GLUE validation split",
    batched=True,
)
print("✅ GLUE SST-2 dataset loaded & tokenized!")


# 3. Alpaca-Cleaned / Flan-T5
print("\n=== Loading Alpaca-Cleaned (Flan-T5) ===")
flan_t5_tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.FLAN_T5_MODEL)
flan_t5_data = load_dataset("yahma/alpaca-cleaned", split=ProjectConfig.FLAN_SPLIT)

def preprocess_alpaca(example):
    """Tokenize one Alpaca record into model inputs and labels.

    The prompt is the ``instruction`` followed, when ``input`` is present and
    non-blank, by that input on a new line. The target is ``output``. Both are
    truncated and padded to ``ProjectConfig.FLAN_MAX_SEQ_LENGTH`` with the
    module-level ``flan_t5_tokenizer``.

    Args:
        example: A single dataset record with ``instruction`` and ``output``
            keys and an optional ``input`` key.

    Returns:
        The tokenizer output for the prompt, with an added ``labels`` list in
        which every padding position is replaced by -100.
    """
    # ``input`` may be absent or None; both mean "no extra context".
    input_text = example.get("input") or ""
    prompt = example["instruction"]
    if input_text.strip():
        prompt += "\n" + input_text
    target = example["output"]

    model_inputs = flan_t5_tokenizer(
        prompt,
        truncation=True,
        max_length=ProjectConfig.FLAN_MAX_SEQ_LENGTH,
        padding="max_length"
    )
    label_ids = flan_t5_tokenizer(
        target,
        truncation=True,
        max_length=ProjectConfig.FLAN_MAX_SEQ_LENGTH,
        padding="max_length"
    )["input_ids"]

    # Labels are padded to a fixed length here, so padding positions must be
    # masked explicitly: -100 is the index the loss ignores. Without this the
    # model is also trained to predict pad tokens.
    pad_id = flan_t5_tokenizer.pad_token_id
    model_inputs["labels"] = [
        token_id if token_id != pad_id else -100 for token_id in label_ids
    ]
    return model_inputs

# preprocess_alpaca handles one record at a time, hence batched=False.
tokenized_alpaca = flan_t5_data.map(
    preprocess_alpaca,
    desc="Tokenizing Alpaca dataset",
    remove_columns=flan_t5_data.column_names,
    batched=False,
)
print("✅ Alpaca-cleaned dataset loaded & tokenized!")


print("\n=== All datasets processed successfully ===")
# Show the first tokenized record of each dataset as a quick sanity check.
for _sample_label, _sample_dataset in (
    ("Sample from tokenized code dataset:", tokenized_code_dataset),
    ("Sample from tokenized GLUE train:", tokenized_train),
    ("Sample from tokenized Alpaca:", tokenized_alpaca),
):
    print(_sample_label, _sample_dataset[0])


# Training loops

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

class ProjectConfig:
    """Settings used by the StarCoder2-3B fine-tuning cell.

    Defining this class rebinds the name ``ProjectConfig``, so attributes of
    any earlier ``ProjectConfig`` are no longer reachable through it.
    """

    # Model identifier passed to ``from_pretrained``.
    STARCODER_MODEL = "bigcode/starcoder2-3b"
    # Maximum sequence length handed to the trainer.
    MAX_SEQ_LENGTH = 1024
    # Directory for checkpoints and the final saved model/tokenizer.
    STARCODER_OUTPUT = "./starcoder2-finetuned"

# Setup starcoder2 model & tokenizer
def setup_starcoder2_model():
    """Load the StarCoder2-3B causal language model and its tokenizer.

    Both are loaded from ``ProjectConfig.STARCODER_MODEL``; the model is
    loaded in float16 with ``device_map="auto"``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    print("Loading StarCoder2-3B model...")
    checkpoint = ProjectConfig.STARCODER_MODEL
    load_options = {
        "device_map": "auto",
        # float16 weights take half the memory of float32; switching to
        # float32 needs more GPU memory, not less.
        "torch_dtype": torch.float16,
    }
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer

starcoder_model, starcoder_tokenizer = setup_starcoder2_model()


# Apply LoRA (PEFT): only the adapter weights on the listed modules are trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # StarCoder2-3B attention layers
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model; from here on starcoder_model is the PEFT-wrapped model.
starcoder_model = get_peft_model(starcoder_model, lora_config)

# Define Training Args
from transformers import TrainingArguments

def get_starcoder_training_args():
    """Build the ``TrainingArguments`` for the StarCoder2 fine-tuning run.

    Uses a per-device batch of 1 with 8 gradient-accumulation steps, caps the
    run at 1000 steps, and saves a checkpoint every 200 steps (two are kept).

    Returns:
        TrainingArguments: writes to ``ProjectConfig.STARCODER_OUTPUT``.
    """
    settings = dict(
        output_dir=ProjectConfig.STARCODER_OUTPUT,
        # Batch geometry -- adjust to the available VRAM.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        # Optimisation
        learning_rate=2e-4,
        optim="paged_adamw_8bit",
        fp16=True,
        max_steps=1000,  # small number for testing
        # Logging and checkpointing
        logging_steps=10,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        report_to="none",
    )
    return TrainingArguments(**settings)

training_args = get_starcoder_training_args()

# Setup SFTTrainer
# starcoder_model was already wrapped with get_peft_model() earlier in this
# cell, so peft_config is not passed again: supplying both an adapter-wrapped
# model and a peft_config is redundant.
# NOTE(review): train_dataset is already tokenized, so "input_ids" names a
# token-id column rather than raw text -- confirm the installed TRL version
# handles pre-tokenized datasets as intended.
starcoder_trainer = SFTTrainer(
    model=starcoder_model,
    tokenizer=starcoder_tokenizer,
    train_dataset=tokenized_code_dataset,
    eval_dataset=None,  # add a validation split for real tasks
    dataset_text_field="input_ids",
    max_seq_length=ProjectConfig.MAX_SEQ_LENGTH,
    args=training_args,
)

# Run the fine-tuning loop.
print("Starting StarCoder2-3B fine-tuning...")
starcoder_trainer.train()

# Write the model and the tokenizer into the same output directory.
print(f"Saving fine-tuned model to {ProjectConfig.STARCODER_OUTPUT} ...")
for _saveable in (starcoder_model, starcoder_tokenizer):
    _saveable.save_pretrained(ProjectConfig.STARCODER_OUTPUT)
print("✅ Model saved!")


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

class ProjectConfig:
    """Settings used by the RoBERTa SST-2 fine-tuning cell.

    Defining this class rebinds the name ``ProjectConfig``, so attributes of
    any earlier ``ProjectConfig`` are no longer reachable through it.
    """

    # Model and output location
    ROBERTA_MODEL = "roberta-base"
    ROBERTA_OUTPUT = "./roberta-sst2-finetuned"

    # Tokenization
    ROBERTA_MAX_SEQ_LENGTH = 128

    # Optimisation schedule
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 3
    TRAIN_BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 16


# Setup RoBERTa model & tokenizer
def setup_roberta_model():
    """Load RoBERTa with a two-class classification head, plus its tokenizer.

    Both are loaded from ``ProjectConfig.ROBERTA_MODEL``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = ProjectConfig.ROBERTA_MODEL
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # SST-2 is binary classification, hence two labels.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return model, tokenizer

roberta_model, roberta_tokenizer = setup_roberta_model()

# ------------------------------
# Training arguments
# ------------------------------
def get_roberta_training_args():
    """Build the ``TrainingArguments`` for RoBERTa SST-2 fine-tuning.

    Evaluation and checkpointing both happen every 200 steps so that
    ``load_best_model_at_end`` can restore the checkpoint with the best
    ``accuracy``. That metric must be produced by the Trainer's
    ``compute_metrics`` callback.

    Returns:
        TrainingArguments: writes to ``ProjectConfig.ROBERTA_OUTPUT``.
    """
    import inspect

    # The evaluation-schedule keyword was renamed from ``evaluation_strategy``
    # to ``eval_strategy`` in newer transformers releases; use whichever one
    # the installed version accepts so the call does not fail on either.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    settings = dict(
        output_dir=ProjectConfig.ROBERTA_OUTPUT,
        eval_steps=200,
        save_steps=200,
        save_total_limit=2,
        learning_rate=ProjectConfig.LEARNING_RATE,
        per_device_train_batch_size=ProjectConfig.TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=ProjectConfig.EVAL_BATCH_SIZE,
        num_train_epochs=ProjectConfig.NUM_EPOCHS,
        weight_decay=0.01,
        logging_steps=50,
        logging_dir="./logs",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=True,
        report_to="none",
    )
    settings[eval_key] = "steps"
    return TrainingArguments(**settings)

training_args = get_roberta_training_args()

# Data collator that pads each batch.
# NOTE(review): preprocess_glue already pads every example to max_length, so
# little dynamic padding is left for this collator to do -- confirm intent.
data_collator = DataCollatorWithPadding(tokenizer=roberta_tokenizer)


def compute_roberta_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    The training arguments select the best checkpoint by ``accuracy``; that
    key only exists in the evaluation metrics if a ``compute_metrics``
    callback returns it, which is what this function provides.

    Args:
        eval_pred: Object exposing ``predictions`` (model logits) and
            ``label_ids`` (reference labels).

    Returns:
        dict: ``{"accuracy": <fraction of correct predictions>}``.
    """
    import numpy as np

    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


# Setup Trainer
roberta_trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=roberta_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_roberta_metrics,
)

# Run the fine-tuning loop.
print("Starting RoBERTa SST-2 fine-tuning...")
roberta_trainer.train()

# Write the model and the tokenizer into the same output directory.
print(f"Saving fine-tuned model to {ProjectConfig.ROBERTA_OUTPUT} ...")
for _saveable in (roberta_model, roberta_tokenizer):
    _saveable.save_pretrained(ProjectConfig.ROBERTA_OUTPUT)
print("✅ RoBERTa SST-2 model saved!")


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

class ProjectConfig:
    """Settings used by the Flan-T5 Alpaca fine-tuning cell.

    Defining this class rebinds the name ``ProjectConfig``, so attributes of
    any earlier ``ProjectConfig`` are no longer reachable through it.
    """

    # Model and output location
    FLAN_T5_MODEL = "google/flan-t5-base"
    FLAN_T5_OUTPUT = "./flan-t5-alpaca-finetuned"

    # Data
    FLAN_SPLIT = "train[:5%]"
    FLAN_MAX_SEQ_LENGTH = 256

    # Optimisation schedule
    LEARNING_RATE = 5e-5
    NUM_EPOCHS = 3
    TRAIN_BATCH_SIZE = 4
    EVAL_BATCH_SIZE = 4

# Setup Flan-T5 model & tokenizer
def setup_flan_t5_model():
    """Load the Flan-T5 sequence-to-sequence model and its tokenizer.

    Both are loaded from ``ProjectConfig.FLAN_T5_MODEL``; the model is loaded
    in float32 with ``device_map="auto"``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = ProjectConfig.FLAN_T5_MODEL
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    load_options = {
        "device_map": "auto",
        "torch_dtype": torch.float32,  # full precision
    }
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, **load_options)
    return model, tokenizer

flan_t5_model, flan_t5_tokenizer = setup_flan_t5_model()

# Training arguments
def get_flan_t5_training_args():
    """Build the ``TrainingArguments`` for Flan-T5 fine-tuning.

    Mixed precision is switched off, a checkpoint is written every 200 steps
    (two are kept), and no evaluation schedule is set because the trainer is
    given no evaluation dataset.

    Returns:
        TrainingArguments: writes to ``ProjectConfig.FLAN_T5_OUTPUT``.
    """
    settings = dict(
        output_dir=ProjectConfig.FLAN_T5_OUTPUT,
        # Schedule
        num_train_epochs=ProjectConfig.NUM_EPOCHS,
        learning_rate=ProjectConfig.LEARNING_RATE,
        per_device_train_batch_size=ProjectConfig.TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=ProjectConfig.EVAL_BATCH_SIZE,
        fp16=False,  # train in full precision
        # Logging and checkpointing
        logging_steps=50,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        report_to="none",
    )
    return TrainingArguments(**settings)

training_args = get_flan_t5_training_args()

# Seq2seq collator: batches the inputs together with their labels.
data_collator = DataCollatorForSeq2Seq(
    flan_t5_tokenizer,
    model=flan_t5_model,
    padding=True,
)

# Trainer without an evaluation set; pass a validation split as
# eval_dataset once one is available.
flan_trainer = Trainer(
    model=flan_t5_model,
    tokenizer=flan_t5_tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_alpaca,
    eval_dataset=None,
)

# Run the fine-tuning loop.
# NOTE(review): the recorded output of this cell reports a training loss of
# 0.0 at every logged step -- investigate before relying on the saved model.
print("Starting Flan-T5 Alpaca fine-tuning...")
flan_trainer.train()

# Write the model and the tokenizer into the same output directory.
print(f"Saving fine-tuned Flan-T5 model to {ProjectConfig.FLAN_T5_OUTPUT} ...")
for _saveable in (flan_t5_model, flan_t5_tokenizer):
    _saveable.save_pretrained(ProjectConfig.FLAN_T5_OUTPUT)
print("✅ Flan-T5 model saved!")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,588 | Num Epochs = 3 | Total steps = 1,941
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 247,577,856 of 247,577,856 (100.00% trained)


Starting Flan-T5 Alpaca fine-tuning...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


In [None]:
# For code, test pass@1 and run a few completions
# For text, use Trainer.evaluate() and print metrics
# Save models to Hugging Face or GDrive