# Memory-Optimized Chat Memory Model Training

Fine-tuning Llama 3.1 8B Instruct with LoRA to summarize chat conversations (chat memory)

## Setup Steps:
1. Mount Google Drive
2. Install dependencies
3. Configure memory settings
4. Train model with optimizations
5. Save results

In [None]:
# Install required packages
%%capture
!pip install torch transformers datasets accelerate bitsandbytes trl peft

In [None]:
# Mount Google Drive and setup directories
from google.colab import drive
drive.mount('/content/drive')

# Create project directories
!mkdir -p "/content/drive/MyDrive/chat_memory/models"
!mkdir -p "/content/drive/MyDrive/chat_memory/data"
!mkdir -p "/content/drive/MyDrive/chat_memory/notebooks"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Train the model and save both the PEFT adapter and the merged model to Google Drive


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import json
import os
from tqdm import tqdm
import gc
from datetime import datetime
from datasets import Dataset

def clear_memory():
    """Run Python garbage collection and release cached CUDA memory.

    Always calls ``gc.collect()`` and ``torch.cuda.empty_cache()``; when CUDA
    is available it additionally calls ``torch.cuda.synchronize()``.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

def setup_model_and_tokenizer():
    """Load the Llama 3.1 8B Instruct base model and its tokenizer.

    The model is requested in float16 with ``device_map="auto"``; the
    tokenizer's padding token is set to its EOS token.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    print("Setting up model and tokenizer...")

    checkpoint = "unsloth/Meta-Llama-3.1-8B-Instruct"

    # Base model weights
    model_kwargs = dict(
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        use_safetensors=True,
    )
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_kwargs)

    # Matching tokenizer; padding reuses the EOS token
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    print("Model and tokenizer loaded successfully")
    return model, tokenizer

def prepare_training_data(data_path, tokenizer, max_length=1024, validation_split=0.1):
    """Load a JSONL chat dataset and tokenize it into train/validation sets.

    Every non-blank line of ``data_path`` is parsed as one JSON object. From
    each object the code reads ``messages`` (a list of items with
    ``timestamp`` and ``content``), ``start_date``, ``end_date`` and
    ``summary``, builds one prompt string and tokenizes it with padding and
    truncation to ``max_length``.

    Args:
        data_path: Path to the JSONL file.
        tokenizer: Tokenizer used to encode the prompts; its ``eos_token`` is
            appended to every prompt.
        max_length: Fixed token length each example is padded/truncated to.
        validation_split: Fraction of examples (taken from the end of the
            file) used for validation.

    Returns:
        tuple: ``(train_dataset, val_dataset)``, each a ``datasets.Dataset``
        with ``input_ids``, ``attention_mask`` and ``labels`` columns.
    """
    print("Loading and preparing training data...")

    # Explicit UTF-8 so chat text decodes the same on every platform; blank
    # lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    with open(data_path, 'r', encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Split data into train and validation. Slicing by a positive index keeps
    # the split correct when val_size is 0 (data[:-0] would be empty and
    # data[-0:] would be the whole list).
    val_size = int(len(data) * validation_split)
    split_at = len(data) - val_size
    train_data = data[:split_at]
    val_data = data[split_at:]

    print(f"Total examples: {len(data)}")
    print(f"Training examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")

    # Loop-invariant system instruction, shared by every example.
    instruction = """You are a chat summarization assistant. Given a conversation and its date range, create a concise yet comprehensive summary that captures the key points, emotional undertones, and progression of the relationship between participants."""

    def format_data(examples, desc):
        """Turn raw examples into a tokenized ``Dataset``."""
        formatted_data = []
        for item in tqdm(examples, desc=desc):
            messages_text = "\n".join(
                f"{msg['timestamp']} | {msg['content']}"
                for msg in item['messages']
            )

            input_text = f"""Please summarize the following chat conversation that occurred between {item['start_date']} and {item['end_date']}.

[START DATE]
{item['start_date']}
[END DATE]
{item['end_date']}
[CHAT MESSAGES]
{messages_text}"""

            # NOTE(review): the inference code elsewhere in this notebook stops
            # on "[/SUMMARY]", but this training prompt never contains that
            # marker — confirm the intended format.
            prompt = f"""<s>[INST] {instruction}

{input_text} [/INST]
[SUMMARY]
{item['summary']}</s>{tokenizer.eos_token}"""

            encoded = tokenizer(
                prompt,
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors=None,
            )

            # Padding positions get the ignore index (-100) so they do not
            # contribute to the loss. The attention mask is used rather than
            # the token id because the pad token may be the same as EOS.
            labels = [
                token_id if keep else -100
                for token_id, keep in zip(encoded["input_ids"], encoded["attention_mask"])
            ]

            formatted_data.append({
                "input_ids": encoded["input_ids"],
                "attention_mask": encoded["attention_mask"],
                "labels": labels,
            })

            # Show one sample so the prompt format can be checked by eye.
            if len(formatted_data) == 1 and desc == "Processing training examples":
                print("\nSample prompt length:", len(encoded["input_ids"]))
                print("Sample prompt preview:")
                print(prompt[:500] + "...")

        return Dataset.from_list(formatted_data)

    train_dataset = format_data(train_data, "Processing training examples")
    val_dataset = format_data(val_data, "Processing validation examples")

    return train_dataset, val_dataset

def train_model():
    """Fine-tune the base model with LoRA, then save the adapter and a merged model.

    Steps, in order:
      1. Create the ``models`` and ``logs`` folders under the Drive project
         directory.
      2. Load the base model and tokenizer via ``setup_model_and_tokenizer``
         and wrap the model with a LoRA adapter on the attention projections.
      3. Tokenize ``data/train.jsonl`` via ``prepare_training_data``.
      4. Train with ``SFTTrainer``.
      5. Save the adapter to ``models/peft_model``, reload the base model,
         merge the adapter into it and save the result (plus tokenizer) to
         ``models/merged_model``.

    Raises:
        Exception: any error raised during training, saving or merging is
            printed and re-raised unchanged.
    """
    # Imported locally: the training cell's top-level imports only bring in
    # LoraConfig, get_peft_model and prepare_model_for_kbit_training, so
    # without this the merge step would hit a NameError after training ends.
    from peft import PeftModel

    base_dir = "/content/drive/MyDrive/chat_memory"
    os.makedirs(f"{base_dir}/models", exist_ok=True)
    os.makedirs(f"{base_dir}/logs", exist_ok=True)

    # Initialize model and tokenizer
    model, tokenizer = setup_model_and_tokenizer()

    # Explicitly set padding side
    tokenizer.padding_side = 'right'

    # Prepare model for training with gradient checkpointing enabled
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing=True
    )

    # LoRA adapter on the attention projection layers
    lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False
    )

    model = get_peft_model(model, lora_config)
    # The KV cache is turned off for training
    model.config.use_cache = False

    # Prepare datasets
    train_dataset, val_dataset = prepare_training_data(
        f"{base_dir}/data/train.jsonl",
        tokenizer,
        max_length=1024
    )

    # Effective batch size is 1 * 32 through gradient accumulation; the best
    # checkpoint by eval_loss is restored at the end of training.
    training_args = TrainingArguments(
        output_dir=f"{base_dir}/models/training_checkpoints",
        num_train_epochs=10,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=32,
        learning_rate=5e-5,
        lr_scheduler_type="cosine_with_restarts",
        warmup_ratio=0.1,
        weight_decay=0.05,
        fp16=True,
        logging_steps=1,
        optim="adamw_torch",
        save_steps=200,
        eval_steps=200,
        save_total_limit=5,
        logging_dir=f"{base_dir}/logs",
        report_to="none",
        remove_unused_columns=True,
        gradient_checkpointing=True,
        eval_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        ddp_find_unused_parameters=False,
        dataloader_pin_memory=False,
        torch_compile=False,  # Disable torch compile
        gradient_checkpointing_kwargs={"use_reentrant": False}  # Important for stability
    )

    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        max_seq_length=1024,
        packing=False
    )

    print("Starting training...")
    try:
        # Pass-through hook that returns a clone of each gradient for every
        # trainable parameter.
        # NOTE(review): this does not enable gradients by itself — confirm
        # whether it is still needed.
        for param in model.parameters():
            if param.requires_grad:
                param.register_hook(lambda grad: grad.clone())

        trainer.train()

        print("Saving PEFT model...")
        trainer.model.save_pretrained(f"{base_dir}/models/peft_model")

        print("Merging and saving final model...")
        gc.collect()
        torch.cuda.empty_cache()

        # Load a fresh base model for merging. float16 matches the dtype used
        # by the other loaders in this notebook; without torch_dtype the
        # weights would be loaded in the library's default precision.
        base_model = AutoModelForCausalLM.from_pretrained(
            "unsloth/Meta-Llama-3.1-8B-Instruct",
            device_map=None,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
        base_model.to("cuda")

        # Attach the adapter that was just saved
        peft_model = PeftModel.from_pretrained(
            base_model,
            f"{base_dir}/models/peft_model",
        )
        peft_model.to("cuda")

        # Fold the LoRA weights into the base weights and save
        merged_model = peft_model.merge_and_unload()

        merged_model_path = f"{base_dir}/models/merged_model"
        merged_model.save_pretrained(
            merged_model_path,
            safe_serialization=True
        )
        tokenizer.save_pretrained(merged_model_path)

    except Exception as e:
        print(f"An error occurred during training: {str(e)}")
        # Bare raise keeps the original traceback intact
        raise

In [None]:
# Run the full pipeline: load the model, train, save the adapter, merge and save.
train_model()

## Load the PEFT adapter from Google Drive and merge again
This cell was primarily used for troubleshooting the merge process.

In [None]:
# Install required packages
!pip install torch transformers peft accelerate bitsandbytes

from google.colab import drive
from peft import PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc

# Mount Google Drive; merge_peft_model() below reads the saved adapter from
# /content/drive/MyDrive/chat_memory/models/peft_model.
drive.mount('/content/drive')

def merge_peft_model():
    """Merge the saved LoRA adapter into the base model and save the result.

    Loads the float16 base model and its tokenizer, attaches the adapter
    stored under ``models/peft_model`` on Drive, merges it, and writes the
    merged weights (safetensors, 2GB shards) together with the tokenizer to
    ``models/merged_model_new``. The model objects are deleted afterwards
    and the CUDA cache is emptied.
    """
    base_dir = "/content/drive/MyDrive/chat_memory"
    checkpoint = "unsloth/Meta-Llama-3.1-8B-Instruct"
    adapter_dir = f"{base_dir}/models/peft_model"

    print("Loading base model...")
    base = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",  # let accelerate place the weights
        torch_dtype=torch.float16,
        trust_remote_code=True,
        use_safetensors=True,
    )

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

    print("Loading PEFT model...")
    adapted = PeftModel.from_pretrained(base, adapter_dir, device_map="auto")

    print("Merging models...")
    merged = adapted.merge_and_unload()

    print("Saving merged model...")
    merged_model_path = f"{base_dir}/models/merged_model_new"

    # Sharded safetensors output is easier to download
    merged.save_pretrained(
        merged_model_path,
        safe_serialization=True,
        max_shard_size="2GB",
    )

    # The tokenizer goes next to the weights
    tokenizer.save_pretrained(merged_model_path)

    print(f"Model and tokenizer saved to {merged_model_path}")

    # Drop every model reference before collecting and emptying the cache
    del base, adapted, merged
    gc.collect()
    torch.cuda.empty_cache()

# Run the merge process
merge_peft_model()

# Optional: create a gzip-compressed tar archive (not a zip) for easier downloading.
# NOTE(review): the recorded output of this cell shows tar warning
# "file changed as we read it" for the freshly written files — verify the
# archive before relying on it.
!cd "/content/drive/MyDrive/chat_memory/models" && tar -czf merged_model_new.tar.gz merged_model_new/

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading base model...


config.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Loading PEFT model...
Merging models...
Saving merged model...
Model and tokenizer saved to /content/drive/MyDrive/chat_memory/models/merged_model_new
tar: merged_model_new/config.json: file changed as we read it
tar: merged_model_new/generation_config.json: file changed as we read it
tar: merged_model_new/model-00001-of-00009.safetensors: file changed as we read it




tar: merged_model_new/model-00003-of-00009.safetensors: file changed as we read it
tar: merged_model_new/model-00004-of-00009.safetensors: file changed as we read it
tar: merged_model_new/model-00005-of-00009.safetensors: file changed as we read it
tar: merged_model_new/model-00006-of-00009.safetensors: file changed as we read it
tar: merged_model_new/model-00007-of-00009.safetensors: file changed as we read it
tar: merged_model_new/model-00008-of-00009.safetensors: file changed as we read it
tar: merged_model_new/model-00009-of-00009.safetensors: file changed as we read it
tar: merged_model_new/model.safetensors.index.

## Deployment \
Upload the model to the Hugging Face Hub (the cell below pushes the PEFT adapter saved in `peft_model`)


In [1]:
import os
import torch
from huggingface_hub import login, HfApi
from transformers import AutoTokenizer
from peft import PeftModel, AutoPeftModelForCausalLM
import gc

# Clear GPU memory first
torch.cuda.empty_cache()
gc.collect()

# Hugging Face access token. Prefer the HF_TOKEN environment variable so the
# secret never has to be pasted into the notebook; the placeholder is kept as
# the fallback value.
NEW_TOKEN = os.environ.get("HF_TOKEN", "hf_XXXX")
REPO_NAME = "FerretMan/chatmemory-llama3.1-8B-peft"

# Login and setup
login(token=NEW_TOKEN, add_to_git_credential=True)

api = HfApi()
user_info = api.whoami(token=NEW_TOKEN)
print(f"Successfully logged in as: {user_info['name']}")

# Create the (private) model repository if it does not exist yet
api.create_repo(
    repo_id=REPO_NAME,
    private=True,
    exist_ok=True,
    repo_type="model",
    token=NEW_TOKEN
)
print(f"Repository {REPO_NAME} is ready")

# Bind the names before the try block so the cleanup in `finally` cannot
# raise NameError (and hide the real error) when loading fails part-way.
model = None
tokenizer = None

try:
    # Load the adapter (and its base model) on CPU
    model = AutoPeftModelForCausalLM.from_pretrained(
        "/content/drive/MyDrive/chat_memory/models/peft_model",
        device_map={
            "": "cpu"  # Force load on CPU
        },
        torch_dtype=torch.float32,  # Use float32 for stability
        token=NEW_TOKEN,
        low_cpu_mem_usage=True
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        "unsloth/Meta-Llama-3.1-8B-Instruct",
        trust_remote_code=True,
        token=NEW_TOKEN
    )

    # Make sure model is in the correct state for saving
    model.config.use_cache = False

    print("Pushing model to hub...")
    # Push to hub with specific save options
    model.push_to_hub(
        REPO_NAME,
        token=NEW_TOKEN,
        safe_serialization=True,
        max_shard_size="2GB",
        save_on_each_node=True
    )
    print("Model pushed successfully")

    print("Pushing tokenizer to hub...")
    tokenizer.push_to_hub(
        REPO_NAME,
        token=NEW_TOKEN
    )
    print("Tokenizer pushed successfully")

    # Verify files
    files = api.list_repo_files(REPO_NAME, token=NEW_TOKEN)
    print("Files in repository:", files)

except Exception as e:
    print(f"An error occurred during upload: {str(e)}")
    raise

finally:
    # Clean up; both names are always bound here (possibly to None)
    del model
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()

# Only reached when no exception was raised above
print("Upload completed successfully!")

KeyboardInterrupt: 

## Load the PEFT model from HF
The PEFT adapter is convenient to load, so both the adapter and the merged model are uploaded.

In [None]:
import gc
import os

import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList

# Stopping criterion keyed on a single token id (used for [/SUMMARY])
class SummaryStoppingCriteria(StoppingCriteria):
    """Signal a stop once any sequence contains ``stop_token_id``.

    Args:
        stop_token_id: Token id whose presence ends generation.
        device: Stored on the instance; not read by ``__call__``.
    """

    def __init__(self, stop_token_id, device):
        super().__init__()
        self.device = device
        self.stop_token_id = stop_token_id

    def __call__(self, input_ids, scores, **kwargs):
        # True as soon as one row of input_ids contains the stop token
        return any(self.stop_token_id in sequence for sequence in input_ids)

def _extract_summary(decoded):
    """Return the text after the first "[SUMMARY]" up to "[/SUMMARY]" (if any)."""
    sections = decoded.split("[SUMMARY]")
    # Without the opening marker, fall back to the whole decoded text instead
    # of raising IndexError.
    body = sections[1] if len(sections) > 1 else decoded
    return body.split("[/SUMMARY]")[0].strip()


def test_chat_memory_model(input_text, start_date="2024-01-01", end_date="2024-01-01"):
    """Generate a summary for ``input_text`` with the fine-tuned adapter.

    Loads the float16 base model and tokenizer, attaches the PEFT adapter
    from the Hub, builds the same prompt layout used for training, samples up
    to 512 new tokens and returns the text following ``[SUMMARY]``.

    Reads the module-level ``NEW_TOKEN`` for Hub authentication.

    Args:
        input_text: Chat messages, one ``timestamp | content`` line each.
        start_date: Start of the conversation's date range.
        end_date: End of the conversation's date range.

    Returns:
        str: The extracted summary text.
    """
    # Local imports keep the function usable regardless of cell import order.
    import gc
    import shutil

    # Create offload directory
    offload_dir = "./offload"
    os.makedirs(offload_dir, exist_ok=True)

    base_model = None
    model = None
    try:
        # First load the base model
        base_model = AutoModelForCausalLM.from_pretrained(
            "unsloth/Meta-Llama-3.1-8B-Instruct",
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            token=NEW_TOKEN,
            offload_folder=offload_dir,
            offload_state_dict=True
        )

        # Load the base tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            "unsloth/Meta-Llama-3.1-8B-Instruct",
            trust_remote_code=True,
            token=NEW_TOKEN
        )

        # Add [/SUMMARY] to tokenizer special tokens.
        # NOTE(review): only the tokenizer is extended here; the model's
        # embeddings are not resized in this function — confirm the model can
        # actually emit this token.
        special_tokens = {"additional_special_tokens": ["[/SUMMARY]"]}
        tokenizer.add_special_tokens(special_tokens)

        # Load your PEFT adapters
        model = PeftModel.from_pretrained(
            base_model,
            "FerretMan/chatmemory-llama3.1-8B-peft",
            token=NEW_TOKEN,
            offload_folder=offload_dir
        )

        instruction = """You are a chat summarization assistant. Given a conversation and its date range, create a concise yet comprehensive summary that captures the key points, emotional undertones, and progression of the relationship between participants."""

        prompt = f"""<s>[INST] {instruction}

Please summarize the following chat conversation that occurred between {start_date} and {end_date}.

[START DATE]
{start_date}
[END DATE]
{end_date}
[CHAT MESSAGES]
{input_text} [/INST]
[SUMMARY]"""

        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Get token ID for [/SUMMARY]
        summary_end_id = tokenizer.convert_tokens_to_ids("[/SUMMARY]")

        # Setup stopping criteria
        stopping_criteria = StoppingCriteriaList([
            SummaryStoppingCriteria(summary_end_id, "cuda")
        ])

        # Stop on the tokenizer's own EOS as well as on [/SUMMARY]; passing
        # only the custom marker would make generation ignore the real EOS.
        stop_ids = [
            token_id
            for token_id in (tokenizer.eos_token_id, summary_end_id)
            if token_id is not None
        ]

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast()
        with torch.autocast(device_type="cuda"):
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=stop_ids,
                stopping_criteria=stopping_criteria
            )

        # Special tokens are kept so the [/SUMMARY] marker survives decoding
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    finally:
        # Drop both references (the PEFT wrapper also holds the base model),
        # then free memory and remove the offload directory, even on failure.
        model = None
        base_model = None
        gc.collect()
        torch.cuda.empty_cache()
        shutil.rmtree(offload_dir, ignore_errors=True)

    # Extract only the summary part
    return _extract_summary(decoded)


# Hugging Face token: read it from the HF_TOKEN environment variable instead
# of hard-coding it. SECURITY: never commit a real token to a notebook; any
# token that has been stored in source should be revoked.
# When HF_TOKEN is unset, NEW_TOKEN is None and login() is called without a token.
NEW_TOKEN = os.environ.get("HF_TOKEN")
login(token=NEW_TOKEN)

# Clear GPU memory first
import gc
torch.cuda.empty_cache()
gc.collect()

# Test messages, one "timestamp | content" line per message
test_messages = """
2024-01-01 10:00 | Hey, how was your New Year's celebration?
2024-01-01 10:05 | It was great! Watched fireworks with family.
2024-01-01 10:07 | Sounds wonderful! Mine was quiet but nice.
2024-01-01 10:10 | Made any resolutions?
2024-01-01 10:12 | Yes, trying to exercise more and read one book per week.
2024-01-01 10:15 | Those are good ones! I'm focusing on learning photography.
"""

# Run the test
summary = test_chat_memory_model(test_messages)
print("\nGenerated Summary:")
print(summary)

config.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

  with torch.cuda.amp.autocast():


KeyboardInterrupt: 