#### Organize imports

In [None]:
import os
import pandas as pd
import random
from google.colab import drive
from sentence_transformers import InputExample, SentenceTransformer, losses, models
from torch.utils.data import DataLoader

#### Mount Drive

In [None]:
# Training is GPU-intensive, so the notebook runs on Google Colab; Google Drive
# is mounted into the runtime so files can be read from / written to it.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

#### Load JSON file

In [None]:
# Read the course records into a DataFrame (path is relative to the working directory).
COURSES_JSON_PATH = "data/english_courses.json"
json_df = pd.read_json(COURSES_JSON_PATH)

# Quick look at the loaded data: the available columns, then the distinct
# values found in the 'type' column.
print(json_df.columns)
distinct_course_types = json_df["type"].unique()
print(distinct_course_types)

#### Create training examples with positive and negative question-answer pairs

In [None]:
# Candidate answer values used to build "flipped" (deliberately wrong) answers
# for the negative level / type pairs.
all_levels = ['Bachelor', 'Diplom', 'Graduate Diploma', 'Full Degree Master', 'Kandidat', 'Master', 'Part Time Master']
all_types = ['Obligatorisk', 'Mandatory', 'Mandatory (also offered as elective)', 'Elective', 'Obligatorisk (også udbudt som valgfag)', 'Valgfag']
all_rows = json_df.to_dict("records")

# Danish type labels mapped to their English equivalents (all lower-cased).
# Without this, the "wrong" answer generated for a 'Mandatory' course would be
# 'Obligatorisk' -- the same type in Danish -- and a correct answer would be
# labelled as a negative.
_TYPE_TRANSLATIONS = {
    'obligatorisk': 'mandatory',
    'obligatorisk (også udbudt som valgfag)': 'mandatory (also offered as elective)',
    'valgfag': 'elective',
}


def _canonical_type(label):
    """Return a lower-cased, language-independent form of a course-type label."""
    lowered = label.lower()
    return _TYPE_TRANSLATIONS.get(lowered, lowered)


def _value_or_default(value, default):
    """Return ``str(value)``, or *default* when *value* is missing (None/NaN)."""
    return str(value) if pd.notna(value) else default


# (title, description) of every course that actually has a description. Built
# once up front so that negative sampling never touches a missing (NaN/None)
# description, which previously crashed on `.strip()`.
_described_courses = [
    (r.get("course_title", ""), str(r["description"]).strip())
    for r in all_rows
    if pd.notna(r.get("description"))
]

# Resulting list of InputExample pairs: label 1.0 = matching question/answer,
# label 0.0 = mismatching question/answer.
train_examples = []

for row in all_rows:
    title = row.get("course_title", "")
    ects = row.get("ects", "")
    lang = row.get("language", "")
    level = _value_or_default(row.get("level"), "No information")
    ctype = _value_or_default(row.get("type"), "No information")
    desc = _value_or_default(row.get("description"), "No description").strip()

    # --- ECTS ---
    q_ects = f"What is the ECTS value of the course {title}?"
    a_ects_pos = f"{title} is a {ects} ECTS course."
    train_examples.append(InputExample(texts=[q_ects, a_ects_pos], label=1.0))

    # --- Language ---
    q_lang = f"What language is {title} taught in?"
    a_lang_pos = f"{title} is taught in {lang}."
    train_examples.append(InputExample(texts=[q_lang, a_lang_pos], label=1.0))

    # --- Level ---
    q_level = f"What level is the course {title}?"
    a_level_pos = f"{title} is a {level} level course."
    train_examples.append(InputExample(texts=[q_level, a_level_pos], label=1.0))

    # Negative: same sentence with the first level that differs from the real one.
    flipped_levels = [l for l in all_levels if l.lower() != level.lower()]
    if flipped_levels:
        a_level_neg = f"{title} is a {flipped_levels[0]} level course."
        train_examples.append(InputExample(texts=[q_level, a_level_neg], label=0.0))

    # --- Type ---
    q_type = f"Is {title} a mandatory or elective course?"
    a_type_pos = f"{title} is a {ctype} course."
    train_examples.append(InputExample(texts=[q_type, a_type_pos], label=1.0))

    # Negative: same sentence with the first type that is genuinely different,
    # i.e. neither the real type nor its Danish/English translation.
    real_type = _canonical_type(ctype)
    flipped_type = [t for t in all_types if _canonical_type(t) != real_type]
    if flipped_type:
        a_type_neg = f"{title} is a {flipped_type[0]} course."
        train_examples.append(InputExample(texts=[q_type, a_type_neg], label=0.0))

    # --- Description ---
    q_desc = f"What is the description of {title}?"
    a_desc_pos = desc
    train_examples.append(InputExample(texts=[q_desc, a_desc_pos], label=1.0))

    # Negative: a real description taken from a different course. Candidates
    # whose text equals this course's own description are excluded so the same
    # pair is never labelled both 1.0 and 0.0. The negative is skipped when no
    # candidate exists (e.g. a single-course dataset) instead of raising.
    other_descs = [
        text
        for other_title, text in _described_courses
        if other_title != title and text != desc
    ]
    if other_descs:
        train_examples.append(InputExample(texts=[q_desc, random.choice(other_descs)], label=0.0))



#### Workaround to avoid needing an API key for wandb.ai

In [None]:
# Set the WANDB_DISABLED environment variable for this process so that
# training does not ask for a wandb.ai API key.
os.environ.update(WANDB_DISABLED="true")

#### Define the chosen BERT-like model, dataloader and pooling

In [None]:
# Batches of training pairs, reshuffled by the DataLoader on every pass.
TRAIN_BATCH_SIZE = 32
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
)

# Assemble the sentence encoder from two modules: the pretrained MiniLM
# transformer (token embeddings) followed by a pooling layer sized to the
# transformer's embedding dimension (one vector per sentence).
BASE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
word_embedding_model = models.Transformer(BASE_MODEL_NAME)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Contrastive loss over the (question, answer, label) pairs.
train_loss = losses.ContrastiveLoss(model=model)

#### Train and save the model

In [None]:
# Destination folder on the mounted Google Drive for the fine-tuned model.
# NOTE(review): the folder name says "modernbert" although the model loaded
# above is all-MiniLM-L6-v2 -- confirm the name is intentional.
output_path = "/content/drive/MyDrive/modernbert-course-finetuned3"

# Settings handed to the optimizer: learning rate and epsilon.
optimizer_settings = {
    'lr': 2e-5,
    'eps': 1e-6,
}

# All fine-tuning hyperparameters in one place.
fit_settings = {
    'train_objectives': [(train_dataloader, train_loss)],
    'epochs': 4,
    'warmup_steps': 61,
    'optimizer_params': optimizer_settings,
    'weight_decay': 0.01,   # weight-decay regularization
    'use_amp': True,        # mixed-precision training
    'output_path': output_path,
}

# Fine-tune the model and write the result to output_path.
model.fit(**fit_settings)