# Analysis of Phi-3.5-mini model



In [1]:
import transformers
import torch

model_path = "microsoft/Phi-3.5-mini-instruct"

phi_model = transformers.AutoModelForCausalLM.from_pretrained(model_path, torchscript=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

# print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Prepare test inputs
test_inputs = tokenizer("hello, world!", return_tensors="pt")
print(test_inputs.input_ids.shape)
print(test_inputs.attention_mask.shape)


torch.Size([1, 4])
torch.Size([1, 4])


In [3]:
# test forward pass
phi_output = phi_model.forward(**test_inputs)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
You are not running the flash-attention implementation, expect numerical differences.


| past_key_values: torch.Size([1, 32, 4, 96])

In [4]:
phi_output[0][:, -1, :]

tensor([[38.6764, 44.1411, 45.2279,  ..., 35.4041, 35.4046, 35.4034]],
       grad_fn=<SliceBackward0>)

In [5]:
# wrap model

class PhiModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    
    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)[0]

wrapped_model = PhiModel(phi_model)

class PhiModelwithKVCache(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.register_buffer("keyCache", torch.zeros(torch.Size([1, 32, 4, 96])))
        self.register_buffer("valueCache", torch.zeros(torch.Size([1, 32, 4, 96])))
    
    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask, past_key_values=(self.keyCache, self.valueCache))

kv_wrapped_model = PhiModelwithKVCache(phi_model)

In [7]:
wm_output = wrapped_model(test_inputs.input_ids, test_inputs.attention_mask)

In [8]:
kv_output = kv_wrapped_model(test_inputs.input_ids, test_inputs.attention_mask)

ValueError: not enough values to unpack (expected 2, got 1)

In [12]:
input_ids = torch.zeros((1, 2), dtype=torch.int32)
attention_mask = torch.ones((1, 2), dtype=torch.int32)


In [17]:
traced_model = torch.jit.trace(wrapped_model.eval(), (input_ids, attention_mask))

  if sequence_length != 1:
  if seq_len > self.original_max_position_embeddings:
  ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
  if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):


## Convert model to CoreML


In [18]:
import coremltools as ct
import numpy as np

query_length = ct.RangeDim(lower_bound=1, upper_bound=2048, default=1)

inputs = [
    ct.TensorType(name="inputIds", shape=(1, query_length), dtype=np.int32),
    ct.TensorType(name="attentionMask", shape=(1, query_length), dtype=np.int32),
]

outputs = [
    ct.TensorType(name="logits", dtype=np.float16),
]

In [19]:
fp16_mlmodel = ct.convert(
    traced_model.eval(),
    inputs=inputs,
    outputs=outputs,
    source="pytorch",
    minimum_deployment_target=ct.target.iOS18
)

Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/4529 [00:00<?, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:   6%|▌         | 256/4529 [00:00<00:01, 2558.08 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  12%|█▏        | 557/4529 [00:00<00:01, 2822.00 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  19%|█▊        | 845/4529 [00:00<00:01, 2847.45 ops/s]Saving value type of int64 into a b

Exception: For an ML Program, extension must be .mlpackage (not .mlprogram). Please see https://coremltools.readme.io/docs/unified-conversion-api#target-conversion-formats to see the difference between neuralnetwork and mlprogram model types.

In [20]:
fp16_mlmodel.save("phi-3.5-mini-instruct-fp16.mlpackage")

## Load ML model and run inference using greedy decoding
