In [1]:
import json
import math
from pathlib import Path

import datasets
import transformers
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Load dataset from JSON file
def load_dataset(filename='weather_chatbot_dataset.json'):
    """Load a JSON array of records into a Hugging Face ``Dataset``.

    The file must contain a non-empty JSON list of dictionaries. The keys of
    the first record define the columns; every other record must provide the
    same keys.

    Args:
        filename: Path to the JSON file.

    Returns:
        A ``Dataset`` with one column per key of the first record.

    Raises:
        ValueError: If the file does not hold a non-empty list of records.
        KeyError: If a later record lacks a key present in the first record.
    """
    # Read as UTF-8 explicitly: the default encoding is platform-dependent and
    # the responses contain non-ASCII characters such as the degree sign.
    with open(filename, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # Fail with a clear message instead of an IndexError/KeyError on records[0].
    if not isinstance(records, list) or not records:
        raise ValueError(f"{filename!r} must contain a non-empty JSON list of records")

    # Convert list of dictionaries to dictionary of lists
    columns = {key: [record[key] for record in records] for key in records[0]}
    return Dataset.from_dict(columns)

# Load the dataset
# No filename is passed, so the function's default path is used; it is a
# relative path and is therefore resolved against the current working directory.
data = load_dataset()

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
data

Dataset({
    features: ['user_input', 'intent_extraction', 'api_response', 'assistant_response'],
    num_rows: 376
})

In [21]:
data[0]

{'user_input': 'What is the current weather in Warsaw today?',
 'intent_extraction': {'entities': {'city': 'Warsaw', 'date': 'today'},
  'intent': 'current_weather'},
 'api_response': {'date': None,
  'description': 'clear sky',
  'humidity': 65,
  'location': 'Warsaw, PL',
  'temperature': 18.25,
  'wind_speed': 3.6},
 'assistant_response': 'The weather in Warsaw is currently clear sky with a temperature of 18.25°C, wind speed of 3.6 meters per second, and humidity of 65%.'}

In [2]:
# !pip install --upgrade jupyter ipywidgets

In [3]:
# !pip install tensorboard

In [4]:
# Load the tokenizer and the model from one shared checkpoint id so the two
# cannot drift apart when the checkpoint is changed.
checkpoint = 'unsloth/Phi-3-mini-4k-instruct'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run locally — keep it only for sources you trust.
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:07<00:00,  3.67s/it]


In [5]:
# LoRA adapter settings for causal language modelling
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [6]:
# Get the PEFT model
# NOTE(review): `model` is rebound to the result of get_peft_model(model, ...),
# so re-running this cell feeds the already-adapted model back in. Re-run the
# model-loading cell first if this cell has to be executed again.
model = get_peft_model(model, peft_config)

In [7]:
def tokenize_function(examples, max_length=256, tok=None):
    """Tokenize prompt/response pairs as single causal-LM training sequences.

    A causal LM learns by predicting the next token of ONE sequence, so the
    labels must line up token-for-token with ``input_ids``. Tokenizing the
    response separately as ``labels`` does not satisfy that, and
    ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds labels from
    ``input_ids`` anyway, which would leave the responses out of training.
    Each example is therefore encoded as ``"<user_input>\\n<response><eos>"``.

    Args:
        examples: Batch mapping with ``'user_input'`` and
            ``'assistant_response'`` lists of strings (as passed by
            ``Dataset.map(..., batched=True)``).
        max_length: Fixed sequence length to pad/truncate to. Keeping this
            small avoids padding every row up to the tokenizer's maximum.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which padded positions are replaced by -100 so the
        loss ignores them.
    """
    active_tokenizer = tokenizer if tok is None else tok
    # Terminate each sample so the model learns where a response ends.
    eos = active_tokenizer.eos_token or ""

    texts = [
        f"{question}\n{answer}{eos}"
        for question, answer in zip(examples['user_input'], examples['assistant_response'])
    ]
    model_inputs = active_tokenizer(
        texts, padding="max_length", truncation=True, max_length=max_length
    )
    # Use the attention mask (not the pad id) to find padding, so a tokenizer
    # whose pad token equals its eos token still keeps the real eos as a label.
    model_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Apply tokenize_function to the dataset; batched=True hands the function
# lists of column values rather than one record at a time.
tokenized_data = data.map(tokenize_function, batched=True)

Map: 100%|████████████████████████████| 376/376 [00:00<00:00, 451.47 examples/s]


In [8]:
# Build the batch collator; mlm=False selects causal (next-token) language
# modelling instead of masked language modelling.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [9]:
# # Training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     overwrite_output_dir=True,
#     num_train_epochs=3,
#     per_device_train_batch_size=2,  # Reduce batch size
#     save_steps=10_000,
#     save_total_limit=2,
#     prediction_loss_only=True,
#     fp16=False,  # Set to False for CPU training
# )


In [27]:
# Training arguments (CPU run)
# The dataset has 376 rows; with batch size 1 and 8 gradient-accumulation steps
# that is at most 47 optimizer steps per epoch (~141 over 3 epochs). Step-based
# intervals therefore have to be much smaller than that, otherwise nothing is
# ever logged or checkpointed.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Smaller batch size
    gradient_accumulation_steps=8,  # Simulate a larger batch size
    save_strategy="epoch",  # Checkpoint once per epoch (a 10_000-step interval is never reached)
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=False,  # Set to False for CPU training
    # no_cuda=True,
    use_cpu=True,
    logging_dir='./logs',  # TensorBoard log directory
    logging_steps=10,  # Log the training loss every 10 optimizer steps
    # weight_decay=0.01,  # Apply weight decay
)


In [28]:
#### GPU TRAINING 

# training_args = TrainingArguments(
#     output_dir="./results",
#     overwrite_output_dir=True,
#     num_train_epochs=3,
#     per_device_train_batch_size=1,  # Start with 1 due to limited VRAM
#     save_steps=10_000,
#     save_total_limit=2,
#     prediction_loss_only=True,
#     fp16=True,  # Enable mixed precision
#     logging_dir='./logs',
#     logging_steps=500,
# )


In [29]:
import torch
# Ask PyTorch to release cached CUDA memory before building the trainer.
# NOTE(review): the active TrainingArguments set use_cpu=True, so this looks
# like a leftover from the GPU configuration — confirm it is still wanted.
torch.cuda.empty_cache()

In [30]:
# Hold out 10% of the rows for evaluation. Evaluating on the rows the model
# was trained on would report an over-optimistic loss and perplexity.
# A fixed seed keeps the split identical across kernel restarts.
split_data = tokenized_data.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=split_data["train"],
    eval_dataset=split_data["test"],
)


In [16]:
# !ssh -T git@hf.co

In [None]:
# Train the model
# This is the last expression of the cell, so the value returned by train()
# is displayed as the cell output.
trainer.train()


In [None]:
# Evaluate the model
results = trainer.evaluate()

# Perplexity is the exponential of the mean evaluation loss.
# `math` must be imported for this cell to run; it is imported with the other
# modules at the top of the notebook.
perplexity = math.exp(results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")


In [None]:
import matplotlib.pyplot as plt

def _load_saved_log_history(output_dir):
    """Return ``log_history`` from the newest ``trainer_state.json`` under output_dir, or []."""
    state_files = sorted(
        Path(output_dir).glob("**/trainer_state.json"),
        key=lambda path: path.stat().st_mtime,
    )
    if not state_files:
        return []
    with open(state_files[-1], 'r', encoding='utf-8') as f:
        return json.load(f).get("log_history", [])


def plot_loss(training_args, log_history=None):
    """Plot the logged training loss against the training step.

    The loss values come from the Trainer's log history (a list of dicts).
    TensorBoard ``events.out.tfevents.*`` files are not used: ``open()`` does
    not expand ``*`` wildcards, and those files are binary records rather
    than JSON lines.

    Args:
        training_args: Object whose ``output_dir`` is searched for a saved
            ``trainer_state.json`` when ``log_history`` is not given.
        log_history: Optional list of log dicts, e.g.
            ``trainer.state.log_history``. Only entries that have both a
            ``'loss'`` and a ``'step'`` key are plotted.

    Returns:
        None. Shows the figure, or prints a notice when no loss was logged.
    """
    if log_history is None:
        log_history = _load_saved_log_history(training_args.output_dir)

    # Exact key match: evaluation ('eval_loss') and summary ('train_loss')
    # entries are deliberately skipped.
    points = [
        (entry['step'], entry['loss'])
        for entry in log_history
        if 'loss' in entry and 'step' in entry
    ]
    if not points:
        print("No training-loss entries found; train first or lower `logging_steps`.")
        return

    steps, loss = zip(*points)

    plt.figure(figsize=(10, 5))
    plt.plot(steps, loss, label='Training Loss')
    plt.xlabel('Steps')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Plot the training loss for the run configured by `training_args`.
plot_loss(training_args)


In [24]:
# !nvidia-smi

In [24]:


# Save the model and tokenizer
# model.save_pretrained('path_to_save_your_model')
# tokenizer.save_pretrained('path_to_save_your_model')
