In [1]:
import sys
import os
import yaml
from datetime import datetime
import re 

# Date stamp (YYYY-MM-DD) taken at the moment this cell runs.
snapshot_date = f"{datetime.now():%Y-%m-%d}"

# Put the parent directory on the import path.
sys.path.append(os.path.abspath(os.pardir))
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# NOTE(review): yaml.FullLoader can construct more than plain data types.
# If this file could ever come from an untrusted source, consider
# yaml.safe_load instead.
with open('./llama-fc-wo-descriptions_config.yaml') as f:
    d = yaml.load(f, Loader=yaml.FullLoader)

# Values read from the `config` section of the YAML file.
_config_section = d['config']
AZURE_SUBSCRIPTION_ID = _config_section['AZURE_SUBSCRIPTION_ID']
AZURE_RESOURCE_GROUP = _config_section['AZURE_RESOURCE_GROUP']
AZURE_WORKSPACE = _config_section['AZURE_WORKSPACE']
AZURE_DATA_NAME = _config_section['AZURE_SFT_DATA_NAME']
DATA_DIR = _config_section['SFT_DATA_DIR']
CLOUD_DIR = _config_section['CLOUD_DIR']
HF_MODEL_NAME_OR_PATH = _config_section['HF_MODEL_NAME_OR_PATH']
IS_DEBUG = _config_section['IS_DEBUG']
USE_LOWPRIORITY_VM = _config_section['USE_LOWPRIORITY_VM']

# Values read from the `train` section of the YAML file.
_train_section = d['train']
azure_env_name = _train_section['azure_env_name']
azure_compute_cluster_name = _train_section['azure_compute_cluster_name']
azure_compute_cluster_size = _train_section['azure_compute_cluster_size']

# Create the local working directories if they do not exist yet.
for _work_dir in (DATA_DIR, CLOUD_DIR):
    os.makedirs(_work_dir, exist_ok=True)

In [2]:
import logging

# Logger for this notebook. DEBUG is the lowest standard level, so every
# record is let through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise attach a second pair of handlers and
# emit every message twice. Detach and close whatever is already attached.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# Console handler (StreamHandler writes to stderr by default), DEBUG and above.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# File handler writing the same records to debug.log in the working directory.
file_handler = logging.FileHandler("debug.log")
file_handler.setLevel(logging.DEBUG)

# One shared format for both destinations.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Attach the handlers to the logger.
logger.addHandler(console_handler)
logger.addHandler(file_handler)

In [3]:
logger.info("===== 0. Azure ML Training Info =====")

# Setting label -> current value, listed in the order they are logged.
# NOTE(review): these values (including the subscription ID) end up in the
# cell output; clear the output before sharing the notebook.
_training_info = {
    "AZURE_SUBSCRIPTION_ID": AZURE_SUBSCRIPTION_ID,
    "AZURE_RESOURCE_GROUP": AZURE_RESOURCE_GROUP,
    "AZURE_WORKSPACE": AZURE_WORKSPACE,
    "AZURE_DATA_NAME": AZURE_DATA_NAME,
    "DATA_DIR": DATA_DIR,
    "CLOUD_DIR": CLOUD_DIR,
    "HF_MODEL_NAME_OR_PATH": HF_MODEL_NAME_OR_PATH,
    "IS_DEBUG": IS_DEBUG,
    "USE_LOWPRIORITY_VM": USE_LOWPRIORITY_VM,
    "azure_env_name": azure_env_name,
    "azure_compute_cluster_name": azure_compute_cluster_name,
    "azure_compute_cluster_size": azure_compute_cluster_size,
}
for _label, _value in _training_info.items():
    logger.info(f"{_label}={_value}")

2025-01-03 10:38:13,032 - __main__ - INFO - ===== 0. Azure ML Training Info =====
2025-01-03 10:38:13,033 - __main__ - INFO - AZURE_SUBSCRIPTION_ID=8cebb108-a4d5-402b-a0c4-f7556126277f
2025-01-03 10:38:13,034 - __main__ - INFO - AZURE_RESOURCE_GROUP=azure-ml-priya-demo
2025-01-03 10:38:13,036 - __main__ - INFO - AZURE_WORKSPACE=azure-ml-priya-westus3
2025-01-03 10:38:13,037 - __main__ - INFO - AZURE_DATA_NAME=sft-data-function-call-wo-desc
2025-01-03 10:38:13,038 - __main__ - INFO - DATA_DIR=./dataset_wo_desc
2025-01-03 10:38:13,039 - __main__ - INFO - CLOUD_DIR=./cloud
2025-01-03 10:38:13,040 - __main__ - INFO - HF_MODEL_NAME_OR_PATH=unsloth/Llama-3.2-3B-Instruct
2025-01-03 10:38:13,041 - __main__ - INFO - IS_DEBUG=True
2025-01-03 10:38:13,042 - __main__ - INFO - USE_LOWPRIORITY_VM=False
2025-01-03 10:38:13,043 - __main__ - INFO - azure_env_name=slm-llama-acft-custom-env
2025-01-03 10:38:13,044 - __main__ - INFO - azure_compute_cluster_name=gpu-a100-demo-vm
2025-01-03 10:38:13,045 - _

### 2. Training Preparation
#### 2.1 Configure Workspace Details
To connect to a workspace, we need three identifying parameters: a subscription ID, a resource group, and a workspace name. We pass these details to `MLClient` (from `azure.ai.ml`) to get a handle on the Azure Machine Learning workspace we need. This hands-on uses the default Azure authentication (`DefaultAzureCredential`).

In [4]:
# import required libraries
import time
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component
from azure.ai.ml import command
from azure.ai.ml.entities import Data, Environment, BuildContext
from azure.ai.ml.entities import Model
from azure.ai.ml import Input
from azure.ai.ml import Output
from azure.ai.ml.constants import AssetTypes
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError

credential = DefaultAzureCredential()

# First try to build the client from a workspace config file. If that fails
# for any reason, print the reason and connect with the explicit
# subscription / resource group / workspace values instead.
ml_client = None
try:
    ml_client = MLClient.from_config(credential)
except Exception as config_error:
    print(config_error)
    ml_client = MLClient(
        credential,
        subscription_id=AZURE_SUBSCRIPTION_ID,
        resource_group_name=AZURE_RESOURCE_GROUP,
        workspace_name=AZURE_WORKSPACE,
    )

Found the config file in: /config.json


#### 2.2. Load and prepare the dataset
Training data can be read from a dataset stored in the local development environment, or it can be registered as an Azure ML data asset. For this hands-on session, we register the data as an Azure ML data asset and use the registered dataset for training and inference.

In [5]:
# Matches one `"description": "<text>"` pair. The value part accepts
# backslash-escaped characters (such as \"), so an escaped quote inside the
# text does not end the match early. Whatever follows the closing quote
# (comma, newline, closing brace) is deliberately NOT part of the match.
_DESCRIPTION_FIELD = re.compile(r'"description":\s*"(?:[^"\\]|\\.)*"')


def remove_desc_from_prompts(data):
    """Blank out every "description" value in ``data['system']``.

    Each ``"description": "<text>"`` pair in the system message is replaced
    with ``"description":""``. The key itself is kept; only its text is
    emptied. The separator that followed the original value is left
    untouched, so a description that was the last key of its object does not
    gain a trailing comma and the surrounding line breaks are preserved.

    Args:
        data: Mapping with a ``'system'`` entry holding the system message
            string.

    Returns:
        The system message with all description texts emptied.

    Raises:
        KeyError: If ``data`` has no ``'system'`` entry.
    """
    system_message = data['system']
    return _DESCRIPTION_FIELD.sub('"description":""', system_message)

In [6]:
from datasets import load_from_disk
import json
import ast

def _rewrite_split_without_descriptions(split_name):
    """Load one split from DATA_DIR and replace `system` with `updated_system`.

    The split is loaded from ``{DATA_DIR}/{split_name}``. Every row's system
    message is passed through ``remove_desc_from_prompts`` and stored in a new
    ``updated_system`` column, the original ``system`` column is dropped, and
    the result is saved back to the same path.

    If the loaded split no longer has a ``system`` column (for example because
    this cell already ran), it is returned unchanged so the cell can be
    re-run without raising.

    Args:
        split_name: Sub-directory of DATA_DIR holding the split
            ("train", "test" or "val").

    Returns:
        The loaded (and, if needed, rewritten) dataset.
    """
    split_path = f"{DATA_DIR}/{split_name}"
    split = load_from_disk(split_path)

    if "system" not in split.column_names:
        return split

    split = split.map(
        lambda row: {"updated_system": remove_desc_from_prompts(row)},
        remove_columns=["system"],
    )
    # NOTE(review): this saves into the directory the split was loaded from;
    # confirm the installed `datasets` version permits overwriting the source
    # directory, otherwise write to a separate output directory.
    split.save_to_disk(split_path)
    return split


## Update the system message by removing function descriptions and argument description
train_dataset = _rewrite_split_without_descriptions("train")
test_dataset = _rewrite_split_without_descriptions("test")
val_dataset = _rewrite_split_without_descriptions("val")

  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: Directory ./dataset_wo_desc/train not found

In [None]:
import re
from datasets import load_dataset, load_from_disk

def get_or_create_data_asset(ml_client, data_name, data_local_dir, update=False):
    
    try:
        latest_data_version = max([int(d.version) for d in ml_client.data.list(name=data_name)])
        if update:
            raise ResourceExistsError('Found Data asset, but will update the Data.')            
        else:
            data_asset = ml_client.data.get(name=data_name, version=latest_data_version)
            print(f"Found Data asset: {data_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        data = Data(
            path=data_local_dir,
            type=AssetTypes.URI_FOLDER,
            description=f"{data_name} for fine tuning",
            tags={"FineTuningType": "Instruction", "Language": "En"},
            name=data_name
        )
        data_asset = ml_client.data.create_or_update(data)
        print(f"Created Data asset: {data_name}")
        
    return data_asset