# DeepSeek Debugging Notebook
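This notebook isolates the DeepSeek-specific steps — model loading, device placement, pad-token setup, and a minimal batched generation — so that device and padding issues can be debugged in one place.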

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
# Checkpoint to debug and the directory that holds the local model checkpoints.
model_name = 'DeepSeek-R1-Distill-Qwen-1.5B'  # Change as needed
model_dir = '../../models'  # Adjust if needed

# Device selection:
#   * more than one GPU -> use the second one ('cuda:1'), as before
#   * exactly one GPU   -> use 'cuda:0', as before
#   * no usable CUDA    -> fall back to 'cpu' instead of naming a CUDA device
#                          that does not exist (which would only fail later,
#                          when the model is moved to `device`).
gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
if gpu_count > 1:
    device = 'cuda:1'
elif gpu_count == 1:
    device = 'cuda:0'
else:
    device = 'cpu'
print(f'Using device: {device}')

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda:1


In [2]:
# Load the model weights and the tokenizer from the same local checkpoint
# directory, then move the model onto the selected device.
checkpoint_path = os.path.join(model_dir, model_name)

# Weights are loaded in float32; trust_remote_code=True is passed to both loaders.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_path,
    trust_remote_code=True,
)

model.to(device)

# Report where the parameters actually live, read from the first parameter.
first_param = next(model.parameters())
print('Model loaded and moved to', first_param.device)

Model loaded and moved to cuda:1


In [3]:
# DeepSeek/Qwen-specific padding handling.
# A checkpoint counts as DeepSeek/Qwen when either family tag appears in the
# configured model name or in the model config's `model_type`.
family_tags = ('deepseek', 'qwen')


def _name_sources():
    """Yield the lower-cased strings that are searched for a family tag."""
    yield model_name.lower()
    yield getattr(model.config, 'model_type', '').lower()


is_deepseek_or_qwen = any(
    tag in source for tag in family_tags for source in _name_sources()
)

if is_deepseek_or_qwen:
    # Reuse EOS as the pad token when no usable pad token is defined.
    pad_token_missing = (
        tokenizer.pad_token is None
        or tokenizer.pad_token_id is None
        or tokenizer.pad_token_id < 0
    )
    if pad_token_missing:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    # These checkpoints are always padded on the left.
    tokenizer.padding_side = 'left'

# Show the resulting padding configuration.
for attr_name in ('pad_token', 'pad_token_id', 'padding_side'):
    print(attr_name + ':', getattr(tokenizer, attr_name))

pad_token: <｜end▁of▁sentence｜>
pad_token_id: 151643
padding_side: left


In [6]:
# Prepare a minimal batch for testing: two prompts of different length, so the
# shorter one has to be padded.
questions = ["What is the capital of France? Let's think step by step", "What is 2+2? Given one number only."]

# BatchEncoding.to(device) moves every tensor in the encoding in one call.
# Moving 'input_ids' and 'attention_mask' by hand would leave any other field
# the tokenizer returns on the CPU, which is exactly the kind of device
# mismatch this notebook is meant to surface.
inputs = tokenizer(
    questions,
    return_tensors='pt',
    padding='longest',
    return_token_type_ids=False,
).to(device)

# Report the device of every field rather than a hard-coded pair of keys.
for field_name, field_tensor in inputs.items():
    print(f'{field_name} device:', field_tensor.device)

input_ids device: cuda:1
attention_mask device: cuda:1


In [7]:
# Try a minimal generation with verbose error catching.
# On failure the traceback and the devices of the model and inputs are printed;
# the exception is deliberately not re-raised so the diagnostics stay visible.
import traceback

try:
    print('Model device before generation:', next(model.parameters()).device)
    gen_tokens = model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,  # greedy decoding
        # Pass the pad token explicitly instead of relying on generate()'s
        # implicit fallback to eos_token_id, which is what the
        # "Setting `pad_token_id` to `eos_token_id`" log message reports.
        pad_token_id=tokenizer.pad_token_id,
    )
    print('Generation successful!')
    print('Generated tokens:', gen_tokens)
    print('Decoded:', tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
except Exception:
    print('Generation failed!')
    traceback.print_exc()
    # Device diagnostics: a model/input device mismatch is the usual suspect.
    print('Model device:', next(model.parameters()).device)
    print('input_ids device:', inputs['input_ids'].device)
    if 'attention_mask' in inputs:
        print('attention_mask device:', inputs['attention_mask'].device)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Model device before generation: cuda:1
Generation successful!
Generated tokens: tensor([[151646,   3838,    374,    279,   6722,    315,   9625,     30,   6771,
            594,   1744,   3019,    553,   3019,    624,  32313,     11,    773,
            358,   1184,    311,   7071,    700,    279],
        [151643, 151646,   3838,    374,    220,     17,     10,     17,     30,
          16246,    825,   1372,   1172,     13,   2055,     11,    220,     17,
             10,     17,    374,    220,     17,     10]], device='cuda:1')
Decoded: ["What is the capital of France? Let's think step by step.\nOkay, so I need to figure out the", 'What is 2+2? Given one number only. So, 2+2 is 2+']
