<a href="https://colab.research.google.com/github/RafalW3bCraft/RWC-FinTunna/blob/main/RWC_FinTunna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RWC-FinTunna



# System Configuration

In [None]:
import os

import torch

# --- Report the framework version and whether an accelerator is visible ---
print("🚀 SYSTEM INFORMATION")
print("=" * 50)
print(f"PyTorch version: {torch.__version__}")

_has_cuda = torch.cuda.is_available()
print(f"CUDA available: {_has_cuda}")

if not _has_cuda:
    print("❌ No GPU detected! Please enable GPU in Runtime -> Change runtime type")
else:
    # Device 0 is the only device inspected; memory is converted from bytes by 1024**3.
    _total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_total_bytes / 1024**3:.1f} GB")
    print(f"CUDA version: {torch.version.cuda}")

# --- Process-wide environment switches read by the tokenizers / wandb integrations ---
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "WANDB_DISABLED": "true",  # Disable wandb logging initially
    }
)

In [None]:
# Install Unsloth from its GitHub repository with the "colab-new" extra (-q keeps pip quiet).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
# Install the training stack with --no-deps so pip does not resolve their
# dependencies; trl is pinned below 0.9.0.
!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes xformers datasets -q
# Remaining model, vision, plotting and data libraries, with normal dependency resolution.
!pip install transformers diffusers opencv-python pillow matplotlib scikit-learn pandas numpy -q

print("✅ Installation completed!")

In [None]:
import warnings
warnings.filterwarnings('ignore')  # silence library warnings for cleaner notebook output
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoModelForImageClassification, AutoImageProcessor,
    AutoModelForSequenceClassification, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer, DataCollatorForSeq2Seq,
    BitsAndBytesConfig, pipeline, DataCollatorWithPadding, DataCollatorForLanguageModeling
)
from datasets import Dataset, load_dataset
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
from torchvision.transforms.functional import pil_to_tensor
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
# NOTE: a dangling `from transformers import` (no names) used to sit here. It was a
# SyntaxError that prevented this entire cell from running, so it has been removed;
# every name that was actually imported is still imported.
import torch

# Unsloth for efficient training
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer

# Image generation pipelines
from diffusers import StableDiffusionPipeline, DiffusionPipeline
from PIL import Image

from torch import nn # Import nn for reinitializing the classifier
import os # Import os for directory creation

print("🔧 Setting up project configuration...")

In [None]:
class ProjectConfig:
    """Notebook-wide constants: model checkpoints, output paths and hyperparameters.

    Everything is a plain class attribute, so values are read as
    ``ProjectConfig.NAME`` without creating an instance.
    """

    # ------------------------------------------------------------------
    # Output locations
    # ------------------------------------------------------------------
    OUTPUT_DIR = "./outputs"
    STARCODER_OUTPUT = OUTPUT_DIR + "/starcoder2-3b-finetuned"
    ROBERTA_OUTPUT = OUTPUT_DIR + "/roberta-finetuned"
    FLAN_T5_OUTPUT = OUTPUT_DIR + "/flan-t5-finetuned"
    DISTILBERT_OUTPUT = "./distilbert-finetuned"
    MOBILENET_OUTPUT = "./mobilenetv2-finetuned"

    # ------------------------------------------------------------------
    # Active models
    # ------------------------------------------------------------------
    # Text classification
    DISTILBERT_MODEL = "distilbert-base-uncased"
    DISTILBERT_MAX_SEQ_LENGTH = 128  # kept short to limit memory use

    # Image classification
    MOBILENET_MODEL = "google/mobilenet_v2_1.0_224"
    IMAGE_SIZE = 224  # input resolution used for MobileNetV2

    # Checkpoints that are currently switched off:
    # STARCODER_MODEL = "bigcode/starcoder2-3b"
    # ROBERTA_MODEL = "roberta-base"
    # FLAN_T5_MODEL = "google/flan-t5-small"

    # ------------------------------------------------------------------
    # Optimisation schedule
    # ------------------------------------------------------------------
    NUM_EPOCHS = 3
    MAX_STEPS = 500
    WARMUP_STEPS = 100
    LEARNING_RATE = 5e-5              # safe default
    BATCH_SIZE = 8                    # conservative for a T4
    GRADIENT_ACCUMULATION_STEPS = 1   # no accumulation
    MAX_SEQ_LENGTH = 1024             # reduced for T4 memory

    # ------------------------------------------------------------------
    # LoRA adapter settings
    # ------------------------------------------------------------------
    LORA_R = 16
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.05

    # ------------------------------------------------------------------
    # bitsandbytes 4-bit quantization settings
    # ------------------------------------------------------------------
    USE_4BIT = True
    BNB_4BIT_QUANT_TYPE = "nf4"
    BNB_4BIT_USE_DOUBLE_QUANT = True
    BNB_4BIT_COMPUTE_DTYPE = torch.bfloat16

# Create the folders for the two active pipelines plus the shared output root.
# (The StarCoder / RoBERTa / Flan-T5 folders are not created while those
# pipelines are switched off.)
for dir_path in (
    ProjectConfig.OUTPUT_DIR,
    ProjectConfig.DISTILBERT_OUTPUT,
    ProjectConfig.MOBILENET_OUTPUT,
):
    os.makedirs(dir_path, exist_ok=True)

# Summarise which checkpoints this run will fine-tune.
print(
    f"  1. Text Classification: {ProjectConfig.DISTILBERT_MODEL}",
    f"  2. Image Classification: {ProjectConfig.MOBILENET_MODEL}",
    sep="\n",
)

In [None]:
def print_gpu_utilization():
    """Print the CUDA memory currently allocated and reserved by PyTorch.

    Both figures are byte counts divided by 1024**3 and shown with one
    decimal place. When CUDA is unavailable a single notice is printed
    instead. Returns None.
    """
    if not torch.cuda.is_available():
        print("❌ No GPU available")
        return

    bytes_per_unit = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_unit
    reserved = torch.cuda.memory_reserved() / bytes_per_unit
    print(f"🔋 GPU Memory - Allocated: {allocated:.1f}GB")
    print(f"🔋 GPU Memory - Reserved: {reserved:.1f}GB")

def clear_memory():
    """Release PyTorch's cached CUDA memory and report that it was cleared.

    Does nothing (and prints nothing) when CUDA is unavailable. Returns None.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    print("🧹 GPU memory cleared")

# Disabled helper: the definition below is wrapped in a string literal, so
# setup_quantization_config is NOT defined when this cell runs.
'''
def setup_quantization_config():
    return BitsAndBytesConfig(
        load_in_4bit=ProjectConfig.USE_4BIT,
        bnb_4bit_compute_dtype=ProjectConfig.BNB_4BIT_COMPUTE_DTYPE,
        bnb_4bit_quant_type=ProjectConfig.BNB_4BIT_QUANT_TYPE,
        bnb_4bit_use_double_quant=ProjectConfig.BNB_4BIT_USE_DOUBLE_QUANT,
    )
'''
# Announce that the helpers are defined and show the GPU memory state at this point.
print("🛠️ Utility functions ready!")
print_gpu_utilization()

In [None]:
from huggingface_hub import notebook_login

# Authentication is best-effort: a failure here must not stop the notebook.
# The messages now match the branch that printed them (the success branch used
# to claim authentication was skipped) and the failure reason is shown instead
# of being discarded.
try:
    notebook_login()
except Exception as e:
    print(f"failed continuing without authentication... ({type(e).__name__}: {e})")
else:
    print("✅ Ready to proceed (Hugging Face login requested)")


In [None]:
# Disabled cell: the StarCoder2 setup below is wrapped in a string literal, so
# it neither defines setup_starcoder2_model nor loads a model.
# NOTE(review): it reads ProjectConfig.STARCODER_MODEL, which is only present as
# a commented-out line in the configuration cell — restore it before re-enabling.
'''
def setup_starcoder2_model():

    clear_memory()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=ProjectConfig.STARCODER_MODEL,
        max_seq_length=ProjectConfig.MAX_SEQ_LENGTH,
        dtype=None,  # Auto-detect
        load_in_4bit=True,
        # token="hf_..." # Uncomment if using private models
    )

    # Add LoRA fine-tuning
    model = FastLanguageModel.get_peft_model(
        model,
        r=ProjectConfig.LORA_R,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        lora_alpha=ProjectConfig.LORA_ALPHA,
        lora_dropout=ProjectConfig.LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    print(f"✅ StarCoder2-3B loaded successfully!")
    print_gpu_utilization()

    return model, tokenizer

starcoder_model, starcoder_tokenizer = setup_starcoder2_model()
print("📝 StarCoder2-3B setup function ready!")
'''

In [None]:
# Disabled cell: the RoBERTa setup below is wrapped in a string literal, so
# nothing in it runs.
# NOTE(review): it depends on ProjectConfig.ROBERTA_MODEL (commented out in the
# configuration cell) and on setup_quantization_config (itself disabled in the
# utilities cell) — both must be restored before re-enabling.
'''
def setup_roberta_model(num_labels=2):

    clear_memory()

    tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.ROBERTA_MODEL)

    quantization_config = setup_quantization_config()

    model = AutoModelForSequenceClassification.from_pretrained(
        ProjectConfig.ROBERTA_MODEL,
        num_labels=num_labels,
        quantization_config=quantization_config,
        device_map="auto"
    )

    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=ProjectConfig.LORA_R,
        lora_alpha=ProjectConfig.LORA_ALPHA,
        target_modules=["query", "value", "key", "dense"],
        lora_dropout=ProjectConfig.LORA_DROPOUT,
        bias="none",
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)

    print(f"✅ RoBERTa Base loaded successfully!")
    print_gpu_utilization()

    return model, tokenizer

roberta_model, roberta_tokenizer = setup_roberta_model()
print("📝 RoBERTa Base setup function ready!")
'''

In [None]:
# Remove LoRA and complex quantization to save memory.

def setup_distilbert_model(num_labels=2):
    """Load the DistilBERT checkpoint with a sequence-classification head.

    Args:
        num_labels: Number of output classes passed to the classification head.

    Returns:
        A ``(model, tokenizer)`` tuple loaded from
        ``ProjectConfig.DISTILBERT_MODEL``. The model is loaded as-is: no
        quantization or LoRA is applied.
    """
    # Free cached GPU memory left over from any previously loaded model.
    clear_memory()
    print("Loading DistilBERT model...")

    checkpoint = ProjectConfig.DISTILBERT_MODEL
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

    print(f"✅ DistilBERT Base loaded successfully with {num_labels} labels!")
    print_gpu_utilization()
    return model, tokenizer

# Load the binary (2-label) DistilBERT classifier and its tokenizer into the
# notebook globals used by the later data and training cells.
distilbert_model, distilbert_tokenizer = setup_distilbert_model(num_labels=2)
print("📝 New model setup functions ready and models loaded!")

In [None]:

def setup_mobilenet_model(num_labels=10): # Assuming 10 labels for a dataset like CIFAR-10
    """Load MobileNetV2 with a freshly initialised ``num_labels``-way classifier.

    Args:
        num_labels: Number of target classes for the new classification head.

    Returns:
        A ``(model, processor)`` tuple loaded from
        ``ProjectConfig.MOBILENET_MODEL``. The model has been moved to CUDA
        when available, otherwise it stays on the CPU.
    """
    clear_memory()
    print("Loading MobileNetV2 model...")
    processor = AutoImageProcessor.from_pretrained(ProjectConfig.MOBILENET_MODEL)
    # Passing num_labels together with ignore_mismatched_sizes=True lets
    # from_pretrained build the new head itself, so the model's own label count
    # (used when computing the loss) matches the classifier width. Replacing
    # model.classifier by hand left that count at the checkpoint's value.
    model = AutoModelForImageClassification.from_pretrained(
        ProjectConfig.MOBILENET_MODEL,
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
        # No quantization or LoRA applied here for memory efficiency.
    )
    model.to('cuda' if torch.cuda.is_available() else 'cpu')

    print(f"✅ MobileNetV2 loaded successfully with {num_labels} labels!")
    print_gpu_utilization()
    # The caller unpacks two values; without this return it received None.
    return model, processor

# Load the 10-class MobileNetV2 model and its image processor into notebook
# globals; this unpacking expects setup_mobilenet_model to return a
# (model, processor) pair.
mobilenet_model, mobilenet_processor = setup_mobilenet_model(num_labels=10)
print("📝 New model setup functions ready and models loaded!")

In [None]:
# List the output directories of the pipelines that are actually active.
# The previous version listed the StarCoder / RoBERTa / Flan-T5 folders, which
# no live code creates, while claiming they had been created. makedirs is
# idempotent, so calling it here keeps the message true even if this cell is
# run on its own.
print(f"\n💾 Output directories created:")
for output_dir in [ProjectConfig.OUTPUT_DIR, ProjectConfig.DISTILBERT_OUTPUT, ProjectConfig.MOBILENET_OUTPUT]:
    os.makedirs(output_dir, exist_ok=True)
    print(f"  • {output_dir}")

print("\n🔄 Ready to begin Phase One implementation!")
print_gpu_utilization()


# **1️⃣ Code Generation / Programming Datasets (17+)**

| # | Dataset | Description | Notes / Language | Link |
|---|---------|------------|-----------------|------|
| 1 | `codeparrot/codeparrot-clean` | Clean Python GitHub code | Python | [Link](https://huggingface.co/datasets/codeparrot/codeparrot-clean) |
| 2 | `bigcode/the-stack` | Large multi-language code corpus | Multi-language | [Link](https://huggingface.co/datasets/bigcode/the-stack) |
| 3 | `bigcode/the-stack-smol` | Small subset for testing | Multi-language | [Link](https://huggingface.co/datasets/bigcode/the-stack-smol) |
| 4 | `codeparrot/github-jupyter-text` | Jupyter notebooks, code + markdown | Python | [Link](https://huggingface.co/datasets/codeparrot/github-jupyter-text) |
| 5 | `codeparrot/cleaned-java` | Cleaned Java code | Java | [Link](https://huggingface.co/datasets/codeparrot/cleaned-java) |
| 6 | `codeparrot/cleaned-cpp` | Cleaned C++ code | C++ | [Link](https://huggingface.co/datasets/codeparrot/cleaned-cpp) |
| 7 | `codeparrot/cleaned-javascript` | Cleaned JS code | JavaScript | [Link](https://huggingface.co/datasets/codeparrot/cleaned-javascript) |
| 8 | `HuggingFace Code Dataset hub` | Collection of code datasets | Multiple | [Link](https://huggingface.co/datasets?search=code) |
| 9 | `CodeSearchNet` | Code + docstring pairs for search | Python, Java, Go, etc. | [Link](https://huggingface.co/datasets/code_search_net) |
| 10 | `CodeContests` | Competitive programming solutions | Python, C++ | [Link](https://huggingface.co/datasets/codecontests) |
| 11 | `HackerRank` | Code challenges and solutions | Python, Java, C++ | [Link](https://www.kaggle.com/datasets/anishathalye/hackerrank-solutions) |
| 12 | `Funcom` | Code summarization dataset | Python | [Link](https://huggingface.co/datasets/funcom) |
| 13 | `CoNaLa` | Code/Natural Language dataset | Python | [Link](https://huggingface.co/datasets/conala) |
| 14 | `Java-Large` | Large Java dataset | Java | [Link](https://github.com/tech-srl/code2seq) |
| 15 | `CodeXGLUE` | Multiple code tasks (translation, repair, summarization) | Multi-language | [Link](https://github.com/microsoft/CodeXGLUE) |
| 16 | `MBPP` | Python code + natural language | Python | [Link](https://huggingface.co/datasets/mbpp) |
| 17 | `CodeNet` | Large-scale multi-language code dataset | Multi-language | [Link](https://github.com/IBM/Project_CodeNet) |

---

# **2️⃣ NLP / Text Classification / Sentiment Datasets (17+)**

| # | Dataset | Description | Notes | Link |
|---|---------|------------|------|------|
| 1 | `glue/sst2` | Stanford Sentiment Treebank | Binary sentiment | [Link](https://huggingface.co/datasets/glue/viewer/sst2) |
| 2 | IMDB | Movie reviews, binary sentiment | English | [Link](https://huggingface.co/datasets/imdb) |
| 3 | Amazon Reviews | Multi-domain reviews | Multi-class ratings | [Link](https://huggingface.co/datasets/amazon_reviews_multi) |
| 4 | Yelp Reviews | User reviews, multi-class | English | [Link](https://huggingface.co/datasets/yelp_review_full) |
| 5 | Emotion | Multi-class emotion labels | anger, joy, sadness, etc. | [Link](https://huggingface.co/datasets/emotion) |
| 6 | TweetEval | Twitter sentiment | Short informal texts | [Link](https://huggingface.co/datasets/tweeteval) |
| 7 | AG News | News classification | 4 classes | [Link](https://huggingface.co/datasets/ag_news) |
| 8 | DBpedia | Wikipedia text classification | 14 classes | [Link](https://huggingface.co/datasets/dbpedia_14) |
| 9 | 20 Newsgroups | Topic classification | 20 classes | [Link](https://huggingface.co/datasets/20_newsgroups) |
| 10 | CoLA | Grammatical acceptability | Binary | [Link](https://huggingface.co/datasets/glue/viewer/cola) |
| 11 | TREC | Question classification | 6 classes | [Link](https://huggingface.co/datasets/trec) |
| 12 | Financial PhraseBank | Financial sentiment | Positive, negative, neutral | [Link](https://huggingface.co/datasets/financial_phrasebank) |
| 13 | Toxic Comment Classification | Multi-label toxicity | English | [Link](https://huggingface.co/datasets/jigsaw_toxicity_pred) |
| 14 | Multi-Domain Sentiment Dataset (MDSD) | Multi-domain reviews | English | [Link](https://huggingface.co/datasets/mdsd) |
| 15 | Yelp Polarity | Binary sentiment | English | [Link](https://huggingface.co/datasets/yelp_polarity) |
| 16 | Amazon Polarity | Binary sentiment | English | [Link](https://huggingface.co/datasets/amazon_polarity) |
| 17 | SMS Spam Collection | Spam detection | English | [Link](https://huggingface.co/datasets/sms_spam) |

---

# **3️⃣ Instruction-Tuning / Instruction-Following Datasets (17+)**

| # | Dataset | Description | Notes | Link |
|---|---------|------------|------|------|
| 1 | `yahma/alpaca-cleaned` | Instruction + output | 52k examples | [Link](https://huggingface.co/datasets/yahma/alpaca-cleaned) |
| 2 | Stanford Self-Instruct | Multi-domain instructions | 175k+ | [Link](https://huggingface.co/datasets/StanfordAI/self_instruct) |
| 3 | OpenAssistant/oasst1 | Open-source ChatGPT-like instructions | 160k+ | [Link](https://huggingface.co/datasets/OpenAssistant/oasst1) |
| 4 | ShareGPT | Conversations scraped from ChatGPT | Multi-turn dialogue | [Link](https://huggingface.co/datasets/ShareGPT) |
| 5 | Dolly 2.0 | Instruction + output | 15k examples | [Link](https://huggingface.co/datasets/databricks/dolly_2_0) |
| 6 | WizardLM | Instruction + reasoning dataset | 100k+ | [Link](https://huggingface.co/datasets/anon8231489123/wizardlm_unfiltered) |
| 7 | Alpaca-GPT4 | GPT-4 generated instructions | 52k examples | [Link](https://huggingface.co/datasets/tatsu-lab/alpaca_gpt4) |
| 8 | Evol-Instruct | Evolutionary instruction dataset | 100k+ | [Link](https://huggingface.co/datasets/evol-instruct) |
| 9 | FLAN v2 | Google’s FLAN instruction dataset | Multi-task | [Link](https://huggingface.co/datasets/google/flan_v2) |
| 10 | Tulu | Multilingual instruction dataset | 80k+ | [Link](https://huggingface.co/datasets/tulu) |
| 11 | CoT-Instruct | Chain-of-thought instructions | Reasoning tasks | [Link](https://huggingface.co/datasets/cot_instruct) |
| 12 | Super-NaturalInstructions | 1,600+ diverse tasks | Multi-task | [Link](https://huggingface.co/datasets/super_natural_instructions) |
| 13 | MPT-Instruct | MosaicML instruction dataset | Multi-domain | [Link](https://huggingface.co/datasets/mpt_instruct) |
| 14 | Vicuna Dataset | 70k+ instruction-response pairs | Open-source ChatGPT-like | [Link](https://huggingface.co/datasets/vicuna_dataset) |
| 15 | OpenInstruct | 100k+ instructions | Multi-domain | [Link](https://huggingface.co/datasets/openinstruct) |
| 16 | LIMA | Fine-tuning dataset for instruction-following | ~1k high-quality examples | [Link](https://github.com/StanfordAI/LIMA) |
| 17 | Koala | Instruction dataset | ~100k examples | [Link](https://github.com/karpathy/koala) |

---


# Dataset Preprocessing

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


class StarcoderDataConfig:
    """Settings used only by this StarCoder2 preprocessing cell.

    Deliberately not named ``ProjectConfig``: redefining that class here
    replaced the notebook-wide configuration and removed the DistilBERT /
    MobileNet attributes that later cells read.
    """

    STARCODER_MODEL = "bigcode/starcoder2-3b"
    MAX_SEQ_LENGTH = 1024   # adjust per your VRAM (e.g., 512, 2048)
    DATASET_SPLIT = "train[:1%]"  # smaller split for testing


print("Loading dataset...")
code_dataset = load_dataset("codeparrot/codeparrot-clean", split=StarcoderDataConfig.DATASET_SPLIT)

print("Loading tokenizer...")
starcoder_tokenizer = AutoTokenizer.from_pretrained(StarcoderDataConfig.STARCODER_MODEL)


def preprocess_code(example):
    """Tokenize the ``content`` field, truncating to the configured length.

    Called through ``Dataset.map(batched=True)`` below, so ``example`` is a
    batch and ``example["content"]`` holds several source files at once.
    """
    return starcoder_tokenizer(
        example["content"],
        truncation=True,
        max_length=StarcoderDataConfig.MAX_SEQ_LENGTH,
    )


print("Tokenizing dataset...")
# All original columns are dropped so only the tokenizer outputs remain.
tokenized_code_dataset = code_dataset.map(
    preprocess_code,
    remove_columns=code_dataset.column_names,
    batched=True,
    num_proc=4
)

print("✅ StarCoder2-3B dataset loaded & tokenized!")
print(tokenized_code_dataset)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


class RobertaDataConfig:
    """Settings used only by this RoBERTa preprocessing cell.

    Deliberately not named ``ProjectConfig``: redefining that class here
    replaced the notebook-wide configuration and removed the DistilBERT /
    MobileNet attributes that later cells read.
    """

    ROBERTA_MODEL = "roberta-base"
    MAX_SEQ_LENGTH = 128


print("Loading tokenizer...")
roberta_tokenizer = AutoTokenizer.from_pretrained(RobertaDataConfig.ROBERTA_MODEL)

print("Loading GLUE SST-2 dataset...")
glue_train = load_dataset("glue", "sst2", split="train")
glue_val = load_dataset("glue", "sst2", split="validation")


def preprocess_glue(example):
    """Tokenize a batch of SST-2 sentences and carry their labels along.

    Every sentence is truncated/padded to ``RobertaDataConfig.MAX_SEQ_LENGTH``.
    The ``label`` column is copied to ``labels`` (as the DistilBERT
    preprocessing cell does); without it the mapped dataset has no targets
    because all original columns are removed by ``map``.
    """
    encoded = roberta_tokenizer(
        example["sentence"],
        truncation=True,
        max_length=RobertaDataConfig.MAX_SEQ_LENGTH,
        padding="max_length",
    )
    encoded["labels"] = example["label"]
    return encoded


# NOTE: preprocess_glue, glue_train/glue_val and tokenized_train/tokenized_val
# are also assigned by the DistilBERT data cell further down; whichever cell
# runs last determines what those names refer to.
print("Tokenizing datasets...")
tokenized_train = glue_train.map(
    preprocess_glue,
    batched=True,
    remove_columns=glue_train.column_names,
    desc="Tokenizing train split"
)

tokenized_val = glue_val.map(
    preprocess_glue,
    batched=True,
    remove_columns=glue_val.column_names,
    desc="Tokenizing validation split"
)

print("✅ RoBERTa Base dataset loaded & tokenized!")
print(tokenized_train)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


class FlanT5DataConfig:
    """Settings used only by this Flan-T5 preprocessing cell.

    Deliberately not named ``ProjectConfig``: redefining that class here
    replaced the notebook-wide configuration and removed the DistilBERT /
    MobileNet attributes that later cells read.
    """

    FLAN_T5_MODEL = "google/flan-t5-base"
    MAX_SEQ_LENGTH = 256
    DATASET_SPLIT = "train[:5%]"  # small slice for testing


print("Loading tokenizer...")
flan_t5_tokenizer = AutoTokenizer.from_pretrained(FlanT5DataConfig.FLAN_T5_MODEL)

print("Loading Alpaca-cleaned dataset...")
flan_t5_data = load_dataset("yahma/alpaca-cleaned", split=FlanT5DataConfig.DATASET_SPLIT)


def preprocess_alpaca(example):
    """Turn one Alpaca record into padded encoder inputs and decoder labels.

    The prompt is the instruction, followed by the optional ``input`` field on
    a new line when that field is non-blank. The ``output`` field becomes the
    label sequence. Both sides are truncated/padded to
    ``FlanT5DataConfig.MAX_SEQ_LENGTH``.
    """
    # A missing or None "input" is treated the same as an empty one.
    input_text = example.get("input") or ""
    prompt = example["instruction"]
    if input_text.strip():
        prompt += "\n" + input_text

    target = example["output"]

    model_inputs = flan_t5_tokenizer(
        prompt,
        truncation=True,
        max_length=FlanT5DataConfig.MAX_SEQ_LENGTH,
        padding="max_length",
    )

    label_ids = flan_t5_tokenizer(
        target,
        truncation=True,
        max_length=FlanT5DataConfig.MAX_SEQ_LENGTH,
        padding="max_length",
    )["input_ids"]

    # Labels are padded to a fixed length, so padding positions are replaced
    # with -100 (the index the loss ignores); otherwise the model is also
    # trained to emit pad tokens.
    pad_id = flan_t5_tokenizer.pad_token_id
    model_inputs["labels"] = [-100 if token_id == pad_id else token_id for token_id in label_ids]
    return model_inputs


print("Tokenizing Alpaca dataset...")
tokenized_alpaca = flan_t5_data.map(
    preprocess_alpaca,
    batched=False,
    remove_columns=flan_t5_data.column_names,
    desc="Tokenizing Alpaca"
)

print("✅ Flan-T5 dataset loaded & tokenized!")
print(tokenized_alpaca[0])


In [None]:
print("\n=== Loading GLUE SST-2 (DistilBERT) ===")
distilbert_tokenizer = AutoTokenizer.from_pretrained(ProjectConfig.DISTILBERT_MODEL)

try:
    # Load the dataset
    glue_train = load_dataset("glue", "sst2", split="train")
    glue_val = load_dataset("glue", "sst2", split="validation")
    print("GLUE SST-2 dataset loaded.")
except Exception as e:
    print("Error loading GLUE SST-2 dataset:", e)
    # Everything below needs glue_train/glue_val. Continuing would either fail
    # with a NameError or silently reuse splits left over from another cell,
    # so stop here with the real error.
    raise


def preprocess_glue(example):
    """Tokenize a batch of SST-2 sentences for DistilBERT and attach labels.

    Sentences are truncated/padded to
    ``ProjectConfig.DISTILBERT_MAX_SEQ_LENGTH``; the ``label`` column is
    copied to ``labels``.
    """
    result = distilbert_tokenizer(
        example["sentence"],
        truncation=True,
        max_length=ProjectConfig.DISTILBERT_MAX_SEQ_LENGTH,
        padding="max_length"
    )

    result["labels"] = example["label"]
    return result


print("Tokenizing GLUE SST-2 datasets...")
tokenized_train = glue_train.map(
    preprocess_glue,
    batched=True,
    remove_columns=glue_train.column_names,  # drop the original columns
    desc="Tokenizing GLUE train split"
)
tokenized_val = glue_val.map(
    preprocess_glue,
    batched=True,
    remove_columns=glue_val.column_names,
    desc="Tokenizing GLUE validation split"
)
print("\nSample from tokenized GLUE train (DistilBERT):")
print(tokenized_train[0])

In [None]:
# Resize is only needed by this cell, so it is imported here rather than
# relying on the notebook's import cell.
from torchvision.transforms import Resize

print("\n=== Loading CIFAR-10 (MobileNetV2) ===")
mobilenet_processor = AutoImageProcessor.from_pretrained(ProjectConfig.MOBILENET_MODEL)

try:
    # Load the image dataset
    cifar10_train = load_dataset("cifar10", split="train")
    cifar10_val = load_dataset("cifar10", split="test") # CIFAR-10 uses 'test' for validation/evaluation
    print("CIFAR-10 dataset loaded.")
except Exception as e:
    print("Error loading CIFAR-10 dataset:", e)
    # The rest of the cell cannot run without the splits; surface the real error.
    raise

normalize = Normalize(mean=mobilenet_processor.image_mean, std=mobilenet_processor.image_std)

# Training pipeline: random crop (augmentation), tensor conversion, normalisation.
_transforms = Compose([
    RandomResizedCrop(ProjectConfig.IMAGE_SIZE),
    ToTensor(),
    normalize,
])

# Evaluation pipeline: deterministic resize so metrics are repeatable
# (previously the random crop was applied to the evaluation split as well).
_eval_transforms = Compose([
    Resize((ProjectConfig.IMAGE_SIZE, ProjectConfig.IMAGE_SIZE)),
    ToTensor(),
    normalize,
])


def preprocess_image(example, transform=_transforms):
    """Convert a batch of CIFAR-10 records into model inputs.

    Args:
        example: Batch dict with an ``img`` list of PIL images and a ``label`` list.
        transform: Image pipeline to apply; defaults to the training pipeline.

    Returns:
        A dict with ``pixel_values`` (one transformed tensor per image) and
        ``labels``. The raw ``img``/``label`` columns are not included.
    """
    return {
        "pixel_values": [transform(image.convert("RGB")) for image in example["img"]],
        "labels": example["label"],
    }


print("Processing CIFAR-10 datasets...")
# with_transform applies the pipeline lazily whenever rows are read, instead of
# writing a full-resolution float tensor for every image through map(). It also
# means the random crop is re-drawn each time a training image is fetched.
# NOTE: the Trainer must keep remove_unused_columns=False for this to work, as
# get_mobilenet_training_args() already does.
processed_cifar10_train = cifar10_train.with_transform(preprocess_image)
processed_cifar10_val = cifar10_val.with_transform(
    lambda batch: preprocess_image(batch, transform=_eval_transforms)
)
print("✅ CIFAR-10 dataset loaded & processed for MobileNetV2!")

print("\nSample from processed CIFAR-10 train (MobileNetV2):")
# Show a summary rather than dumping every pixel value of the tensor.
_sample = processed_cifar10_train[0]
print({"pixel_values": tuple(_sample["pixel_values"].shape), "labels": _sample["labels"]})


# Training Arguments

In [None]:
# Disabled cell: the function below is wrapped in a string literal and is never defined.
# NOTE(review): it uses SFTConfig, but the visible import cell only imports
# SFTTrainer from trl — add the import before re-enabling.
'''
def get_starcoder_training_args():
    return SFTConfig(
        per_device_train_batch_size=ProjectConfig.BATCH_SIZE,
        gradient_accumulation_steps=ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=ProjectConfig.WARMUP_STEPS,
        max_steps=ProjectConfig.MAX_STEPS,
        learning_rate=ProjectConfig.LEARNING_RATE,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=ProjectConfig.STARCODER_OUTPUT,
        max_seq_length=ProjectConfig.MAX_SEQ_LENGTH,
    )

'''

In [None]:
# Disabled cell: the function below is wrapped in a string literal and is never defined.
# NOTE(review): it passes evaluation_strategy=..., whereas the active
# DistilBERT/MobileNet argument builders pass eval_strategy=... — align the
# keyword with the installed transformers version before re-enabling.
'''
def get_roberta_training_args():
    return TrainingArguments(
        output_dir=ProjectConfig.ROBERTA_OUTPUT,
        learning_rate=ProjectConfig.LEARNING_RATE,
        per_device_train_batch_size=ProjectConfig.BATCH_SIZE,
        per_device_eval_batch_size=ProjectConfig.BATCH_SIZE,
        gradient_accumulation_steps=ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=100,
        logging_steps=10,
        warmup_steps=ProjectConfig.WARMUP_STEPS,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim="adamw_8bit",
        dataloader_pin_memory=False,
        remove_unused_columns=False,
    )
'''

In [None]:
# Disabled cell: the function below is wrapped in a string literal and is never defined.
# NOTE(review): it passes predict_with_generate=True to TrainingArguments;
# presumably this was meant for the seq2seq-specific arguments class — confirm
# before re-enabling. It also uses evaluation_strategy=... where the active
# builders use eval_strategy=....
'''
def get_flan_t5_training_args():
    return TrainingArguments(
        output_dir=ProjectConfig.FLAN_T5_OUTPUT,
        learning_rate=ProjectConfig.LEARNING_RATE,
        per_device_train_batch_size=ProjectConfig.BATCH_SIZE,
        per_device_eval_batch_size=ProjectConfig.BATCH_SIZE,
        gradient_accumulation_steps=ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=100,
        logging_steps=10,
        warmup_steps=ProjectConfig.WARMUP_STEPS,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim="adamw_8bit",
        predict_with_generate=True,
        dataloader_pin_memory=False,
    )
'''

In [None]:
def get_distilbert_training_args():
    """Build the ``TrainingArguments`` for the DistilBERT fine-tuning run.

    Evaluation and checkpointing both happen every 200 steps, at most two
    checkpoints are kept, and the best checkpoint (by ``"accuracy"``) is
    reloaded at the end. Hyperparameters come from ``ProjectConfig``.
    """
    # Half precision only when CUDA is present and device 0 reports a
    # compute-capability major version of at least 7.
    half_precision = torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 7

    # Evaluation and saving share one cadence.
    every_n_steps = 200

    settings = {
        "output_dir": ProjectConfig.DISTILBERT_OUTPUT,
        "eval_strategy": "steps",
        "eval_steps": every_n_steps,
        "save_strategy": "steps",
        "save_steps": every_n_steps,
        "save_total_limit": 2,
        "learning_rate": ProjectConfig.LEARNING_RATE,
        "per_device_train_batch_size": ProjectConfig.BATCH_SIZE,
        "per_device_eval_batch_size": ProjectConfig.BATCH_SIZE,
        "num_train_epochs": ProjectConfig.NUM_EPOCHS,
        "weight_decay": 0.01,
        "logging_steps": 50,
        "logging_dir": "./logs",
        "load_best_model_at_end": True,
        "metric_for_best_model": "accuracy",
        "fp16": half_precision,
        "report_to": "none",
        "gradient_accumulation_steps": ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
    }
    return TrainingArguments(**settings)

In [None]:
def get_mobilenet_training_args():
    """Build the ``TrainingArguments`` for the MobileNetV2 fine-tuning run.

    Same schedule as the DistilBERT run (evaluate and save every 200 steps,
    keep two checkpoints, reload the best by ``"accuracy"``), plus
    ``remove_unused_columns=False`` so the Trainer does not drop dataset
    columns on its own.
    """
    # Half precision only when CUDA is present and device 0 reports a
    # compute-capability major version of at least 7.
    half_precision = torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 7

    # Evaluation and saving share one cadence.
    every_n_steps = 200

    settings = {
        "output_dir": ProjectConfig.MOBILENET_OUTPUT,
        "eval_strategy": "steps",
        "eval_steps": every_n_steps,
        "save_strategy": "steps",
        "save_steps": every_n_steps,
        "save_total_limit": 2,
        "learning_rate": ProjectConfig.LEARNING_RATE,
        "per_device_train_batch_size": ProjectConfig.BATCH_SIZE,
        "per_device_eval_batch_size": ProjectConfig.BATCH_SIZE,
        "num_train_epochs": ProjectConfig.NUM_EPOCHS,
        "weight_decay": 0.01,
        "logging_steps": 50,
        "logging_dir": "./logs",  # Directory for logs
        "load_best_model_at_end": True,
        "metric_for_best_model": "accuracy",
        "fp16": half_precision,
        "report_to": "none",
        "remove_unused_columns": False,
        "gradient_accumulation_steps": ProjectConfig.GRADIENT_ACCUMULATION_STEPS,
    }
    return TrainingArguments(**settings)

# Training loops

In [None]:
def _distilbert_compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for one evaluation pass.

    The training arguments select the best checkpoint by "accuracy", so the
    Trainer has to be given a metric function that reports that key.
    ``eval_pred`` unpacks into ``(logits, labels)``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


distilbert_trainer = Trainer(
    model=distilbert_model, # Loaded DistilBERT model
    args=get_distilbert_training_args(), # DistilBERT training arguments
    train_dataset=tokenized_train, # Tokenized GLUE train dataset
    eval_dataset=tokenized_val, # Tokenized GLUE validation dataset
    tokenizer=distilbert_tokenizer, # Tokenizer for padding
    data_collator=DataCollatorWithPadding(tokenizer=distilbert_tokenizer),
    compute_metrics=_distilbert_compute_metrics, # Supplies the "accuracy" metric
)

print("DistilBERT fine-tuning in progress...")
try:
    distilbert_trainer.train()
except Exception as e:
    print(f"An error occurred during DistilBERT training: {e}")
    # Do not fall through to the save step: that would write a model that was
    # not (fully) trained while reporting it as fine-tuned.
    raise
print("DistilBERT training completed.")

print(f"Saving fine-tuned DistilBERT model to {ProjectConfig.DISTILBERT_OUTPUT} ...")
os.makedirs(ProjectConfig.DISTILBERT_OUTPUT, exist_ok=True)
distilbert_model.save_pretrained(ProjectConfig.DISTILBERT_OUTPUT)
distilbert_tokenizer.save_pretrained(ProjectConfig.DISTILBERT_OUTPUT)
print("✅ DistilBERT model saved!")

In [None]:
def _mobilenet_compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for one evaluation pass.

    The training arguments select the best checkpoint by "accuracy", so the
    Trainer has to be given a metric function that reports that key.
    ``eval_pred`` unpacks into ``(logits, labels)``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


mobilenet_trainer = Trainer(
    model=mobilenet_model, # Loaded MobileNetV2 model
    args=get_mobilenet_training_args(), # MobileNetV2 training arguments
    train_dataset=processed_cifar10_train, # Processed CIFAR-10 train dataset
    eval_dataset=processed_cifar10_val, # Processed CIFAR-10 validation dataset
    compute_metrics=_mobilenet_compute_metrics, # Supplies the "accuracy" metric
)

print("MobileNetV2 fine-tuning in progress...")
try:
    mobilenet_trainer.train()
except Exception as e:
    print(f"An error occurred during MobileNetV2 training: {e}")
    # Do not fall through to the save step: that would write a model that was
    # not (fully) trained while reporting it as fine-tuned.
    raise
print("MobileNetV2 training completed.")

# Image models are saved together with their processor rather than a tokenizer.
print(f"Saving fine-tuned MobileNetV2 model to {ProjectConfig.MOBILENET_OUTPUT} ...")
os.makedirs(ProjectConfig.MOBILENET_OUTPUT, exist_ok=True)
mobilenet_model.save_pretrained(ProjectConfig.MOBILENET_OUTPUT)
mobilenet_processor.save_pretrained(ProjectConfig.MOBILENET_OUTPUT)
print("✅ MobileNetV2 model saved!")
print("\nTraining loops for MobileNetV2 completed.")

In [None]:
# Disabled cell: the StarCoder2 training loop below is a string literal and never runs.
# NOTE(review): it references starcoder_model, lora_config and training_args,
# none of which are defined by live code in the visible notebook — define them
# before re-enabling.
'''
starcoder_trainer = SFTTrainer(
    model=starcoder_model,
    tokenizer=starcoder_tokenizer,
    train_dataset=tokenized_code_dataset,
    eval_dataset=None,  # add a validation split for real tasks
    peft_config=lora_config,
    dataset_text_field="input_ids",
    max_seq_length=ProjectConfig.MAX_SEQ_LENGTH,
    args=training_args
)

print("Starting StarCoder2-3B fine-tuning...")
starcoder_trainer.train()

# Save the fine-tuned model
print(f"Saving fine-tuned model to {ProjectConfig.STARCODER_OUTPUT} ...")
starcoder_model.save_pretrained(ProjectConfig.STARCODER_OUTPUT)
starcoder_tokenizer.save_pretrained(ProjectConfig.STARCODER_OUTPUT)
print("✅ Model saved!")
print("\nTraining loops for starcoder completed.")
'''

In [None]:
# Disabled cell: the RoBERTa training loop below is a string literal and never runs.
# NOTE(review): it references roberta_model, training_args and data_collator,
# none of which are defined by live code in the visible notebook. It also
# reuses tokenized_train/tokenized_val, names the DistilBERT data cell assigns too.
'''
roberta_trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=roberta_tokenizer,
    data_collator=data_collator
)

print("Starting RoBERTa SST-2 fine-tuning...")
roberta_trainer.train()

print(f"Saving fine-tuned model to {ProjectConfig.ROBERTA_OUTPUT} ...")
roberta_model.save_pretrained(ProjectConfig.ROBERTA_OUTPUT)
roberta_tokenizer.save_pretrained(ProjectConfig.ROBERTA_OUTPUT)
print("✅ RoBERTa SST-2 model saved!")
print("\nTraining loops for RoBERTa SST-2 completed.")
'''

In [None]:
# Disabled cell: the Flan-T5 training loop below is a string literal and never runs.
# NOTE(review): it references flan_t5_model, training_args and data_collator,
# none of which are defined by live code in the visible notebook — define them
# before re-enabling.
'''
flan_trainer = Trainer(
    model=flan_t5_model,
    args=training_args,
    train_dataset=tokenized_alpaca,
    eval_dataset=None,
    data_collator=data_collator,
    tokenizer=flan_t5_tokenizer
)

# Train
print("Starting Flan-T5 Alpaca fine-tuning...")
flan_trainer.train()

print(f"Saving fine-tuned Flan-T5 model to {ProjectConfig.FLAN_T5_OUTPUT} ...")
flan_t5_model.save_pretrained(ProjectConfig.FLAN_T5_OUTPUT)
flan_t5_tokenizer.save_pretrained(ProjectConfig.FLAN_T5_OUTPUT)
print("✅ Flan-T5 model saved!")
print("\nTraining loops for Flan-T5 completed.")
'''

# Task


In [None]:

# Report GPU memory usage at this point in the notebook.
print_gpu_utilization()
