<a href="https://colab.research.google.com/github/Steve-Falkovsky/Hypencoder-Entity-Linking/blob/Professional-Structure/notebooks/fine_tune_Hypencoder_on_BC5CDR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import importlib.util
import os
import sys

# Autoreload re-imports edited modules, so changes made in VS Code are picked up without restarting the kernel.
# Disabled for now: it needs a newer IPython, and upgrading IPython breaks Colab.
# %load_ext autoreload
# %autoreload 2



REPO_NAME = "Hypencoder-Entity-Linking"
GIT_URL = f"https://github.com/Steve-Falkovsky/{REPO_NAME}.git"
BRANCH_NAME = "Professional-Structure"



# --- COLAB SETUP ---
is_colab = importlib.util.find_spec("google.colab") is not None
if is_colab:
    print("‚òÅÔ∏è Running in Colab...")
    if not os.path.exists(REPO_NAME):
        !git clone -b {BRANCH_NAME} --single-branch {GIT_URL}

    # Move into the downloaded repo (The Root)
    os.chdir(REPO_NAME)



# --- LOCAL SETUP ---
else:
    print("üíª Running Locally...")
    if os.path.basename(os.getcwd()) == "notebooks":
        os.chdir("..")


%pip install -q -e "./hypencoder-paper"

os.chdir("./hypencoder-paper")
print(f"üìç Working Directory is now: {os.getcwd()}")
print("‚úÖ Environment Ready!")

In [None]:
# loading the data
from datasets import load_dataset

# Fetch the prepared dataset by its repository id and keep handles to the two
# splits that the following cells write to disk.
dataset = load_dataset("Stevenf232/hypencoder_contrastiveLoss_nameOnly")
train_data, val_data = dataset["train"], dataset["validation"]

In [None]:
# Save both splits as JSON Lines files for the tokenizer script in the next cell.
# Create data/ first so the write does not depend on the directory already
# being present in the checkout.
os.makedirs('data', exist_ok=True)
train_data.to_json('data/train.jsonl', lines=True)
val_data.to_json('data/val.jsonl', lines=True)

In [None]:
# tokenizing the data before training

# training
!python hypencoder_cb/utils/tokenizer_utils.py \
--standard_format_jsonl='data/train.jsonl' \
--output_file='data/train_tokenized.jsonl' \
--tokenizer="google-bert/bert-base-uncased" \
--add_special_tokens=True \
--query_max_length=32 \
--item_max_length=512

# validation
!python hypencoder_cb/utils/tokenizer_utils.py \
--standard_format_jsonl='data/val.jsonl' \
--output_file='data/val_tokenized.jsonl' \
--tokenizer="google-bert/bert-base-uncased" \
--add_special_tokens=True \
--query_max_length=32 \
--item_max_length=512

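Every progress bar in the output above reports `[00:00]`, which looks suspicious, although the tokenization may simply be very fast. It is worth checking that the tokenized files are non-empty before training.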



---



# Training the hypencoder

In [None]:
!python hypencoder_cb/train/train.py hypencoder_cb/train/configs/hypencoder.2_layer_finetuned_BC5CDR.yaml

In [None]:
%pip install huggingface_hub

# push the model to HuggingFace
# the model was saved in the model directory

from huggingface_hub import upload_folder

# Path where the model files are saved in Colab
# check and change this based on which checkpoint you got
local_folder_path = "/content/Hypencoder-Entity-Linking/hypencoder-paper/model/hypencoder.2_layer_finetuned_BC5CDR/checkpoint-80"

In [None]:


from huggingface_hub import create_repo

# Target repository on Hugging Face, written as "<user-or-org>/<repo-name>".
repo_id = "Stevenf232/hypencoder_BC5CDR"

# Create the repository up front; exist_ok=True means an already-existing
# repository is not treated as an error.
create_repo(repo_id, exist_ok=True)

# Upload the contents of the checkpoint folder as one commit. repo_type could
# also be "dataset" or "space".
upload_folder(
    repo_id=repo_id,
    folder_path=local_folder_path,
    commit_message="Upload trained model from Colab",
    repo_type="model",
)