In [1]:
import pandas as pd
path = r"/teamspace/studios/this_studio/AI-Agent-Diabetes-Diagnosis/Datasets/diabetes_unified.jsonl"
data = pd.read_json(path, lines=True)

In [2]:
FT_data = []
for _, row in data.iterrows():
    FT_data.append({
        "instruction": "Summarize the patient's health metrics for the given day.",
        "input": row["input"],       
        "output": row["output"]  
    })

In [4]:
import json

output_path = r"/teamspace/studios/this_studio/AI-Agent-Diabetes-Diagnosis/Datasets/fine-tuning/ftData.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for record in FT_data:
        f.write(json.dumps(record) + "\n")

In [5]:
import json

ft_dataset_path = r"/teamspace/studios/this_studio/AI-Agent-Diabetes-Diagnosis/Datasets/fine-tuning/ftData.jsonl"

tokenizedDataset = []
with open(ft_dataset_path, "r", encoding="utf-8") as f:
    for line in f:
        tokenizedDataset.append(json.loads(line))

print(tokenizedDataset[0])


{'instruction': "Summarize the patient's health metrics for the given day.", 'input': 'Patient: HUPA0026P | Date: 2020-05-23\nGlucose (avg/min/max): 106.9/40.0/167.0, Heart rate avg: 86.1, Steps: 20679.0, Carbs: 0.0, Insulin: 8.0, Events: Meals=0, Activities=137, Hypoglycemia=54, Hyperglycemia=0', 'output': 'On 2020-05-23, patient HUPA0026P had an average glucose of 106.9 (range 40.0–167.0), average heart rate 86.1 bpm, 20679.0 steps, 0.0 carbs, 8.0 insulin units, with 0 meals, 137 activity periods, 54 hypoglycemia, and 0 hyperglycemia events.'}


In [6]:
from huggingface_hub import login
from transformers import AutoTokenizer
import json
from datasets import load_dataset

# Login to Hugging Face
login(token="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")  

modelID = "mistralai/Mistral-7B-v0.1"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(modelID, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token

# Output file
output_file = "/teamspace/studios/this_studio/AI-Agent-Diabetes-Diagnosis/Datasets/fine-tuning/tokenized/tokenizedData.jsonl"

# Tokenize each record individually, without truncation
with open(output_file, "w", encoding="utf-8") as f:
    for row in tokenizedDataset:  
        # Combine instruction and input naturally
        text = row["instruction"].strip() + "\n" + row["input"].strip()
        
        # Tokenize without truncation to keep full record
        tokens = tokenizer(
            text,
            add_special_tokens=True,
            padding=False,
            truncation=False,
            return_attention_mask=True
        )
        
        record = {
            "instruction": row["instruction"],
            "input": row["input"],
            "text": text,
            "input_ids": tokens["input_ids"],
            "attention_mask": tokens["attention_mask"]
        }

        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"Tokenized data saved to {output_file}")

# Load dataset for training
tokenizedDataset = load_dataset("json", data_files=output_file, split="train")




Tokenized data saved to /teamspace/studios/this_studio/AI-Agent-Diabetes-Diagnosis/Datasets/fine-tuning/tokenized/tokenizedData.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import AdamW
import random

# Load tokenizer and model
modelID = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(modelID)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(modelID)

In [8]:
# Use CPU
device = torch.device("cpu")
model.to(device)

# Sample 1000 records or fewer if dataset is smaller
sample_indices = random.sample(range(len(tokenizedDataset)), min(50000, len(tokenizedDataset)))
sampled_dataset = [tokenizedDataset[i] for i in sample_indices]  # FIXED

# Tokenize texts for input_ids
def tokenize_fn(batch):
    text = batch["instruction"] + "\n" + batch["input"]
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

tokenized_samples = [tokenize_fn(sample) for sample in sampled_dataset]

# Prepare DataLoader
class Dataset(torch.utils.data.Dataset):
    def __init__(self, samples):
        self.samples = samples
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v) for k, v in self.samples[idx].items() if k in ["input_ids", "attention_mask"]}
        item["labels"] = item["input_ids"].clone()  # causal LM labels
        return item


In [None]:
train_loader = DataLoader(Dataset(tokenized_samples), batch_size=1, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)


# Training loop (1 epoch for testing)
model.train()
for batch in train_loader:
    optimizer.zero_grad()
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    
    print("Loss:", loss.item())

Loss: 0.04445143789052963
Loss: 0.03474070131778717
Loss: 0.06205904111266136
Loss: 0.03845411166548729
Loss: 0.04780970886349678
Loss: 0.04442743584513664
Loss: 0.03751774877309799
Loss: 0.034513942897319794
Loss: 0.06052820011973381
Loss: 0.048963502049446106
Loss: 0.06678297370672226
Loss: 0.03270254656672478
Loss: 0.04138190671801567
Loss: 0.04990045353770256
Loss: 0.040872711688280106
Loss: 0.061789195984601974
Loss: 0.036595843732357025
Loss: 0.030960869044065475
Loss: 0.04521242901682854
Loss: 0.0346122570335865
Loss: 0.04409269988536835
Loss: 0.049537573009729385
Loss: 0.042189065366983414
Loss: 0.04497317969799042
Loss: 0.05187426134943962
Loss: 0.262341171503067
Loss: 0.04369405284523964
Loss: 0.04598637670278549
Loss: 0.07039165496826172
Loss: 0.044265199452638626
Loss: 0.043945811688899994
Loss: 0.05357125401496887
Loss: 0.049334410578012466
Loss: 0.04373195022344589
Loss: 0.04013403505086899
Loss: 0.0570179745554924
Loss: 0.04441535472869873
Loss: 0.03169548511505127
Loss:

In [13]:
# Save model
model.save_pretrained("./GPTneo-finetuned")
tokenizer.save_pretrained("./GPTneo-finetuned")

('./gpt-neo-finetuned/tokenizer_config.json',
 './gpt-neo-finetuned/special_tokens_map.json',
 './gpt-neo-finetuned/vocab.json',
 './gpt-neo-finetuned/merges.txt',
 './gpt-neo-finetuned/added_tokens.json',
 './gpt-neo-finetuned/tokenizer.json')

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload fine-tuned model
model_path = "./GPTneo-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

model.eval()

# Prompt
prompt = (
    "Instruction: Summarize the patient's health metrics for the given day.\n\n"
    "Input:\n"
    "Patient: HUPA0026P | Date: 2020-05-23\n"
    "Glucose (avg/min/max): 106.9/40.0/167.0, Heart rate avg: 86.1, Steps: 20679.0, "
    "Carbs: 0.0, Insulin: 8.0, Events: Meals=0, Activities=137, Hypoglycemia=54, Hyperglycemia=0\n\n"
    "Response:"
)

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt")

# Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=1200,
    do_sample=True,           # allow sampling
    temperature=1,          # control creativity
    repetition_penalty=1.2,   # reduce repetition
    no_repeat_ngram_size=2,   # block bigram repetition
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Decode
decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)

# Extract response
if "Response:" in decoded:
    response = decoded.split("Response:", 1)[1].strip()
else:
    response = decoded.strip()

print("Generated summary:\n", response)


Generated summary:
 No statistical differences were found for blood glucose, blood pressure, and heart rate.
